Agent Adaptation(2) #107

Merged · 8 commits · Oct 8, 2021
2 changes: 1 addition & 1 deletion doc/rl/flowchart agent-adapt.drawio
@@ -1 +1 @@
<mxfile host="Electron" modified="2021-10-05T05:49:41.655Z" agent="5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/14.9.6 Chrome/89.0.4389.128 Electron/12.0.16 Safari/537.36" etag="gIkDWZGqsJbABNQQ9ofr" version="14.9.6" type="device"><diagram id="C5RBs43oDa-KdzZeNtuy" name="Page-1">7Vxbd6I6FP41PrYLAkF8tPYyp9POctrO6fTprAhBmSJxQqw6v/4ECXKJxeioqO3TkE2AZn/727fEaRid4fSGotHgnrg4aADNnTaMywYAwIYW/yeWzBKJDrRWIulT3xWyTPDo/8FCqAnp2HdxVJjICAmYPyoKHRKG2GEFGaKUTIrTPBIUvzpCfSwJHh0UyNJn32UDIdWtVnbjC/b7g/TTFjSTO0OUzhZLiQbIJZOcyLhqGB1KCEuuhtMODmL1pYp5/mf2HNy9Wje336Pf6MfF16dv/54lL7te55HFGigO2cav1u61a4/1vPGDdnd727n+xcDZmSnwfUPBWGhMLJbNUhVil2tUDAllA9InIQquMukFJePQxfF3ND7K5twRMuJCnQt/YcZmwjzQmBEuGrBhIO4qLlAoIiJj6uCKVQnAGKJ9zCrmiffFC8xZi1DfDSZDzOiMT6A4QMx/K5oUEpbZX8zLlM8vhP7XwEKToHjkK2ASHhGj5HVhzaCoSW6ko3jecNqPGX3uBWTiDPhrzqP4Zf/FcyYDn+HHEZrrcMKnrY3AG6YMTyt1Ju4aTbEs4UBsMZzkyKgJ2SDHQ0vbkZZNScvfSB0mj6c++ykej69f4utzKEaX09yty1lu0MXU55rAVMi2TJ3UR6+ijqFIHWEGZ9q5CU2YPKRMJ/G6LvH52rIpxPMizCRLWHxVyTi+PEzNp/AWA/zWbN/f/3i59ZsL886M44UHr9ocYkm7u4V5qUK0VahuBGmbUjTLTRjFAEdLEBffMUHRiRigFOrWm88vkr9gl/ZVpfacfbVdNOKK8xnXhxVwKC56lF/14ysSNozrbbl/Fzt+5MevrNX/L7KpVQFgIdx6BDAkEB4wG9OQy65REOFtaZz76KEfIkboOzr3/CDokCC+zz9jeLaDHSd+5fx7uTs9m/vO2JN4JGSp51nTJahDplslyMy6YzaEEmT1OeWt+WCoGGrNpmKs3U+aKmPR5cppc1GHDLly+J8Qe5m27M+uwjefknAYq4//Ccvuiwq0hG2G3Hvpaw4l1ItIMGa4TR2B5FyajcwFxfKULpHRRdj2lpLRcmzc8/ZFxrL/bFq1kxFIBlBTBh3y5fzM0uZ4+JJmyvEgS6LnozSL3jKNbUUa66YijbPkyoYta5P8ap8pDZTDaa05895xhWvDekhlUNXSc5AuXPMyvz7PYWPkTi1Zhardih0mqzIW9XjbLbNLV+7RWevzC7TMQ6eYKXf7TsNtKgMLVNPaXAsJ2BvFw7/tN+jpdoZqw6H8QI0dB9nOuiTwndnH8uS6dgCuvHmirhwoMt7YwJVDePgZ8MF0jQ+kIlI2iA1i+6Yl0bohoNkqOxBYHQIsDVY+UGMIkAv2j5HON80ihi2t/hhwkp3MtLWxmvBAkfD7aWXq8mZwFyS9zKwXuZQeSZOzjNwH7lNCWHR/LdV9np31KY3WSVLNUqQaODCqyQeNukZCNVGPAK1uZhV4sgnT9kUuXVeNZLs7RiO3fU+imElps3pXTpVfx1XMmHJIPI2+lDqwqqcC99+XkooSoFUXJaseqK8oSeH4YH2pcklyCH0pIPelFgdinuh4f+dhpMBqg55hWUtCMcS2a9a1Ba8DWHf0NeRs6vMMUx6zllEq/WsvR0y5S/jJsirPWD/L0hTvk2WKLDuEukSu+k8jf01PAa7usQnVntKBlNQVnFq1qQ7q+ofHjuIUhJx6Vm5NfB+jYH5UXiOvx18MlBPL/e5PLP29hxzxjv8HMO//sEWBePYx+c6lKz2URt3fbU7XbwlJB7vKFFoWtDexhnXbQFnTQDgOc8XetPTAcexmV8GVM+d5uDjroYhbZ9Vu3Z7jhRQHVG1WOV6Yu4wXfJj9tD0BL/svAoyr/wE=</diagram></mxfile>
<mxfile host="Electron" modified="2021-10-08T14:37:05.725Z" agent="5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/14.9.6 Chrome/89.0.4389.128 Electron/12.0.16 Safari/537.36" etag="PMO8ycJ8pl6nL-kLunmc" version="14.9.6" type="device"><diagram id="C5RBs43oDa-KdzZeNtuy" name="Page-1">7Vxre6I4FP41fmwf7uJHay+zbjtrL7Md+2UfhKi0SNwQq+6v3yDhIqEYXARkZ74MOQlgznvOe05OQjvyYLG5Q8Zy/gAt4HQkwdp05OuOJEm6qpH/fMk2kIiS0AskM2RbVBYLnu1/ABUKVLqyLeDtDcQQOthe7gtN6LrAxHsyAyG43h82hc7+W5fGDDCCZ9NwWOmrbeE5lYpaL+74BuzZPHy1pipBz8IIR9OpeHPDguuESL7pyAMEIQ6uFpsBcHz1hYp5/W376tx/aHfDR+9v48fV7y/f/7wIHnZb5JZoDgi4+OhHCw/C7RRPpqsn4X44HNy+Y+niQqH4fhrOimqMThZvQxUCi2iUNiHCcziDruHcxNIrBFeuBfz3CKQVj7mHcEmEIhG+A4y31DyMFYZENMcLh/ZyTpAqwoMrZIKcWVHAsIFmAOeMo8/zJ5iwFqq+OwAXAKMtGYCAY2D7c9+kDGqZs2hcrHxyQfVfAAuBgeKZzAAzeHgYwY/ImqV9TRIjXfrjFpuZ79GXUweuzTl5zKXnP+wvf8x6bmPwvDR2OlyTYYUR+AQIg02uzmivLNNpUQLRaXOdcEaByuYJP9SEE2lZYbT8HdZh8mBj45/0dv967F9fqrR1vUl0XW8TjRFANtEEQFRWsuuEHH3IdWRO16FmcCFcKqqiBjdxuxN93AjaZG7xEDidegAzlhC9lcs4vj1tlBd3CCTw2e0/PPwYD+1uZN6xcYxJ8Dp/QsxANXP+QmFUj4K0j5CxTQxY+gB7GYiHJKLvk4gspUJdsfHkIvgFp7SvPBwS9tW3jCVRnI2JPjSHYHM1QeRq5l9BtyPflkX/FjBtz/YfWSv/R9nUoQAQCUuPADIDwhPAK+QS2a3heKAsjROOXtiugSH6QudT23EG0PH7yWvkqW4C0/QfuXtfomeiE+70mWQKXRwyD29WWBgyUUpBptQds1WVgawFpKxyhlqly8nK1aSpLBYj8mP6RDSAC6Ic8hMIyzBkduN+2gi6C1935P1Z/XT5mQI2hu2r3DUBkTHxoLPCoI9MCuNOGreUyL+S/pzyRMsA+jTTEzVTB5NpVZ6YJs+uVrsnSgz6NaXPLpnOzzhn9pvjME32G3EGvWuFKXTJPqxz+rCoFM6sdLWnHZNcVZnPqGwsbUfCzIurWhjWJq2B8qaegDSm5jRpG3726sPWtjRV5a1TnDBNZYGoh2pLdi2RuzqnFXcuqac03b8Uts7XDs4Ui1cPFEk/Ksb91wKCGO5P8FYQ0jcULyFwvEJNUUngIfS2ExQjWEMcQcc2MyoRreV5UWgA0XcbQvTH5tS71qnr0hJn1FCk4lFDVU+WaTNur+u9y17in7pnj72KOUBtTMW7IQs6bjs7Ijs5dkVXNNp11TTJqfnRTtXU3BtqLJiz9Yb/wWqkK+wD2BPqD1KtLMGGZZnD3s4bVaqpwYrsLvZIDoqwX7vHbquJVmd/1Vgj7hNTEZh3g+pkNVa510pX0zhdTWqYq7EnpEZK4GrhaklIehaxG2I2ST+sw9f2POcY36vK3USRN7ad7kQQW8RuRaEtdKRzWzKVVmhjg2Q7Cm38wPIecKy+IsesUSQhf41y6Ib61ighHBl1NIHwal5K1toVSxPqagpb24gO+rygVXXnfJgoq0sTWdMy4rIKdEupKvNlMIsK0PWFYtaVfp3NSmKm91KYNSB9Ypcr7Qiz4bmrw8WBio7NVnqyg90cakVSzA9q8RM7Z7H7zG42RemJv6hNZyiPK8PZHU4W4Mf5pynp7b9qC6uZR+7ZiFfXBkz4QUr0Eco40VPFByl5nyQc9Ff9rDn46g2P7/8YeuLw6c19e+yP3m+GWRv0dXJwSrWnxThbIYcgVXSpSQycN9kEqDv2vZgYHsEor2rfOvpVKqXfTJNi0TjnBQcDBS+K/AdmKl0kZrMAu64/c1L8muw4SFHOxq/03ZG8H5ksfYnB5si1jfzv+b/YJUnBdU57IuX7WHpPRKlyCzIb1sZkpaf2MpnTy7oHs0kx5KVGpB7ZsDZlp6vUk4blLlwqNbWcImierfWURhUa8mabtSnSmH2Q8qk8XVNvQIbLFnzOeBekfMTSFfVTBl/SjP9cUOBo8Z9dkm/+BQ==</diagram></mxfile>
Binary file modified doc/rl/flowchart agent-adapt.pdf
Binary file not shown.
6 changes: 3 additions & 3 deletions examples/rl/Howto 10 - (RL) Train using SB3 Wrapper.py
@@ -7,10 +7,11 @@
## -- yyyy-mm-dd Ver. Auth. Description
## -- 2021-09-29 0.0.0 MRD Creation
## -- 2021-10-07 1.0.0 MRD Released first version
## -- 2021-10-08 1.0.1 DA Take over the cycle limit from the environment
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.0.0 (2021-10-07)
Ver. 1.0.1 (2021-10-08)

This module shows how to train with SB3 Wrapper for On-Policy Algorithm
"""
@@ -75,7 +76,7 @@ def _setup(self, p_mode, p_ada, p_logging):
myscenario = MyScenario(
p_mode=Environment.C_MODE_SIM,
p_ada=True,
p_cycle_limit=500,
p_cycle_limit=-1, # get cycle limit from environment
p_visualize=False,
p_logging=False
)
@@ -84,7 +85,6 @@ def _setup(self, p_mode, p_ada, p_logging):
training = Training(
p_scenario=myscenario,
p_episode_limit=2,
p_cycle_limit=500,
p_collect_states=True,
p_collect_actions=True,
p_collect_rewards=True,
23 changes: 16 additions & 7 deletions src/mlpro/rl/models.py
@@ -34,16 +34,18 @@
## -- 2021-09-25 1.3.4 MRD Remove Previous state into the buffer. Add Next state to the buffer
## -- Remove clearing buffer on every reset. The clearing buffer should
## -- be controlled from the policy
## -- 2021-10-05 1.4.0 DA Enhancements around model-based agents:
## -- 2021-10-05 1.4.0 DA Various changes:
## -- - Class State: new attributes done, broken and related methods
## -- - New class ActionPlanner
## -- - Class Agent: method adapt() implemented
## -- - Class Agent: preparation for model-based mode
## -- Introduction of method Environment.get_cycle_limit()
## -- 2021-10-05 1.4.1 SY Bugfixes and minor improvements
## -- 2021-10-08 1.4.2 DA Class Scenario/constructor/param p_cycle_limit: new value -1
## -- lets class get the cycle limit from the env
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.4.1 (2021-10-05)
Ver. 1.4.2 (2021-10-08)

This module provides model classes for reinforcement learning tasks.
"""
@@ -799,7 +801,7 @@ def __init__(self, p_logging=True):


## -------------------------------------------------------------------------------------------------
def compute_action(self, p_state:State, p_policy:Policy, p_envmodel:EnvModel, p_depth) -> Action:
def compute_action(self, p_state:State, p_policy:Policy, p_envmodel:EnvModel, p_depth, p_width) -> Action:
"""
Computes a path of actions with defined length that maximizes the reward of the given
environment model.
@@ -809,6 +811,7 @@ def compute_action(self, p_state:State, p_policy:Policy, p_envmodel:EnvModel, p_
p_policy Policy of an agent
p_envmodel Environment model
p_depth Planning depth (=length of action path to be predicted)
p_width Planning width (=number of alternative actions per planning level)
"""

raise NotImplementedError
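The extended signature hands the planner a planning width in addition to the planning depth. The following sketch is not part of this PR and only illustrates how a concrete planner could honor both parameters: the RandomShootingPlanner name is made up, and the p_envmodel.predict() call is merely an assumed one-step prediction API of the environment model.

from mlpro.rl.models import ActionPlanner, State, Policy, EnvModel, Action

class RandomShootingPlanner(ActionPlanner):
    """Illustrative planner: evaluates p_width rollouts of length p_depth."""

    C_NAME = 'RandomShootingPlanner'

    def compute_action(self, p_state:State, p_policy:Policy, p_envmodel:EnvModel, p_depth, p_width) -> Action:
        best_action = None
        best_return = None

        for _ in range(p_width):                        # alternative rollouts per planning level
            state          = p_state
            rollout_return = 0
            first_action   = None

            for _ in range(p_depth):                    # length of the predicted action path
                action = p_policy.compute_action(state)
                if first_action is None:
                    first_action = action
                # Assumption: the env model predicts successor state and reward
                state, reward   = p_envmodel.predict(state, action)
                rollout_return += reward

            if ( best_return is None ) or ( rollout_return > best_return ):
                best_return = rollout_return
                best_action = first_action

        return best_action

A model-based Agent would receive such a planner via p_action_planner together with p_planning_depth and p_planning_width (see the constructor change below).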
@@ -833,14 +836,15 @@ class Agent(Policy):
C_NAME = ''

## -------------------------------------------------------------------------------------------------
def __init__(self, p_policy:Policy, p_envmodel:EnvModel=None, p_action_planner:ActionPlanner=None, p_planning_depth=0, p_name='', p_id=0, p_ada=True,
def __init__(self, p_policy:Policy, p_envmodel:EnvModel=None, p_action_planner:ActionPlanner=None, p_planning_depth=0, p_planning_width=0, p_name='', p_id=0, p_ada=True,
p_logging=True):
"""
Parameters:
p_policy Policy object
p_envmodel Optional environment model object
p_action_planner Optional action planner object (obligatory for model based agents)
p_planning_depth Optional planning depth (obligatory for model based agents)
p_planning_width Optional planning width (obligatory for model based agents)
p_name Optional name of agent
p_id Unique agent id (especially important for multi-agent scenarios)
p_ada Boolean switch for adaptivity
@@ -865,6 +869,7 @@ def __init__(self, p_policy:Policy, p_envmodel:EnvModel=None, p_action_planner:A
self._envmodel = p_envmodel
self._action_planner = p_action_planner
self._planning_depth = p_planning_depth
self._planning_width = p_planning_width

self._set_id(p_id)

@@ -966,7 +971,7 @@ def compute_action(self, p_state:State) -> Action:

else:
# 1.2 With action planner
self._previous_action = self._action_planner.compute_action(p_state, self._policy, self._envmodel, self._planning_depth)
self._previous_action = self._action_planner.compute_action(p_state, self._policy, self._envmodel, self._planning_depth, self._planning_width)


# 2 Outro
@@ -1264,7 +1269,7 @@ def __init__(self, p_mode=Environment.C_MODE_SIM, p_ada=True, p_cycle_len:timede
p_mode Operation mode of environment (see Environment.C_MODE_*)
p_ada Boolean switch for adaptivity of agent
p_cycle_len Fixed cycle duration (optional)
p_cycle_limit Maximum number of cycles (0=no limit)
p_cycle_limit Maximum number of cycles (0=no limit, -1=get limit from env)
p_visualize Boolean switch for env/agent visualisation
p_logging Boolean switch for logging functionality
"""
@@ -1280,6 +1285,10 @@ def __init__(self, p_mode=Environment.C_MODE_SIM, p_ada=True, p_cycle_len:timede
# 1 Setup entire scenario
self._setup(p_mode, p_ada, p_logging)

# 2 Finalize cycle limit
if self._cycle_limit == -1:
self._cycle_limit = self._env.get_cycle_limit()

# 2 Init timer
if self._env.get_mode() == Environment.C_MODE_SIM:
t_mode = Timer.C_MODE_VIRTUAL
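To illustrate the new -1 convention introduced above: the scenario no longer needs a hard-coded limit but asks the environment for one. A minimal, self-contained sketch of the resolution step follows; DummyEnv and its return value of 200 are hypothetical, only the -1 convention and the get_cycle_limit() call come from this change.

class DummyEnv:
    """Stand-in exposing the method the Scenario constructor now relies on."""

    def get_cycle_limit(self):
        return 200                      # e.g. the env's own episode length

cycle_limit = -1                        # value passed as p_cycle_limit
env         = DummyEnv()

if cycle_limit == -1:                   # same check as in the constructor above
    cycle_limit = env.get_cycle_limit()

print(cycle_limit)                      # -> 200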
4 changes: 3 additions & 1 deletion src/mlpro/rl/wrappers.py
@@ -23,6 +23,7 @@
## -- Redefine also _recognize_space() from WrEnvGYM2MLPro
## -- 2021-10-07 1.3.4 SY Update WrEnvMLPro2PZoo() following above changes (ver. 1.3.3)
## -- 2021-10-07 1.4.0 MRD Implement WrPolicySB32MLPro to wrap the policy from Stable-baselines3
## -- 2021-10-08 1.4.1 DA Correction of wrapper WrEnvGYM2MLPro
## -------------------------------------------------------------------------------------------------

"""
@@ -150,12 +151,13 @@ def _simulate_reaction(self, p_action:Action):
observation, reward_gym, done, info = self._gym_env.step(action_gym)
except:
observation, reward_gym, done, info = self._gym_env.step(np.atleast_1d(action_gym))
self._state.set_done(done)

obs = DataObject(observation)

# 3 Create state object from Gym observation
state = State(self._state_space)
state.set_values(obs.get_data())
state.set_done(done)
self._set_state(state)

# 4 Create and store reward object
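The correction above moves the done flag from the previous state object onto the state that is actually created and stored for the current step. A toy reproduction of the ordering issue in plain Python (no MLPro classes involved), purely to show why the flag has to be set on the new state:

class ToyState:
    """Minimal stand-in for a state carrying a done flag."""

    def __init__(self):
        self.done = False

previous_state = ToyState()
new_state      = ToyState()

# Before the fix: the flag lands on the state that is about to be replaced...
previous_state.done = True
assert new_state.done is False          # ...so the stored state never reports 'done'

# After the fix: the flag is set on the state that gets stored
new_state.done = True
assert new_state.done is True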