agent.hpp
#ifndef __AGENT_HPP__
#define __AGENT_HPP__
#include <iostream>
#include "main.hpp"
#include "predict.hpp"
#include "search.hpp"
#include "util.hpp"
class ContextTree;
class ModelUndo;
class DecisionNode;
class Agent {
public:

    // construct a learning agent from the command line arguments
    Agent(options_t &options);

    // destruct the agent and the corresponding context tree
    ~Agent(void);

    // current lifetime of the agent in cycles
    lifetime_t lifetime(void) const;

    // the weight (C) used in the UCB algorithm
    double UCBWeight(void) const;

    // the total accumulated reward across an agent's lifespan
    reward_t reward(void) const;

    // the average reward received by the agent at each time step
    reward_t averageReward(void) const;

    // maximum reward in a single time instant
    reward_t maxReward(void) const;

    // minimum reward in a single time instant
    reward_t minReward(void) const;

    // number of distinct actions
    unsigned int numActions(void) const;

    // the length of the stored history for an agent
    size_t historySize(void) const;

    // length of the search horizon used by the agent
    size_t horizon(void) const;

    // generate an action uniformly at random
    action_t genRandomAction(void);

    // generate a percept distributed according to our history statistics
    percept_t *genPercept(void) const;

    // generate a percept distributed according to our history statistics,
    // and update our mixture environment model with it
    percept_t *genPerceptAndUpdate(void);

    // update the agent's internal model of the world
    // after receiving a percept or performing an action
    void modelUpdate(percept_t observation, percept_t reward);
    void modelUpdate(action_t action);

    // revert the agent's internal model of the world to that of a
    // previous time cycle; returns false on failure
    bool modelRevert(const ModelUndo &mu);

    // get the maximum depth of the context tree
    size_t maxTreeDepth(void);

    // calculate the probability of the next symbol
    double getProbNextSymbol(void);

    // resets the agent
    void reset(void);

    // reset, but keep the context tree (CTW) intact
    void newEpisode(void);

    // get the timeout value used by the MC search
    double timeout(void);

    // probability of selecting an action according to the
    // agent's internal model of its own behaviour
    double getPredictedActionProb(action_t action);

    // get the agent's probability of receiving a particular percept
    double perceptProbability(percept_t observation, percept_t reward) const;

    // return the context tree
    ContextTree *contextTree();

    // return the search tree
    DecisionNode *searchTree();

    // reset the search tree to a new root node
    void searchTreeReset();

    // prune the search tree to the subtree of the root corresponding
    // to the given action and observation/reward pair
    void searchTreePrune(action_t action, obsrew_t obsrew);

    // set the agent's configuration options
    void setOptions(options_t &options);

private:

    // action sanity check
    bool isActionOk(action_t action) const;

    // reward sanity check
    bool isRewardOk(reward_t reward) const;

    // encoding/decoding actions and percepts to/from symbol lists
    void encodeAction(symbol_list_t &symlist, action_t action) const;
    void encodePercept(symbol_list_t &symlist, percept_t observation,
                       percept_t reward) const;
    action_t decodeAction(const symbol_list_t &symlist) const;
    percept_t decodeReward(const symbol_list_t &symlist) const;

    // agent properties
    unsigned int m_actions;      // number of actions
    unsigned int m_actions_bits; // number of bits to represent an action
    unsigned int m_obs_bits;     // number of bits to represent an observation
    unsigned int m_rew_bits;     // number of bits to represent a reward
    size_t m_horizon;            // length of the search horizon
    double m_timeout;            // timeout value for the MC search
    DecisionNode *m_st;          // head node of the search tree

    // the maximum CTW tree depth
    size_t m_max_tree_depth;

    // context tree representing the agent's beliefs
    ContextTree *m_ct;

    // how many time cycles the agent has been alive
    lifetime_t m_time_cycle;

    // the total reward received by the agent
    reward_t m_total_reward;

    // true if the last update was a percept update
    bool m_last_update_percept;

    // the weight (C) used in the UCB algorithm
    double m_UCBWeight;
};
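
/*
 * Illustrative sketch (not part of the original header): one way the Agent
 * interface above might be driven in a main interaction loop. The `env`
 * object and its performAction()/getObservation()/getReward() members are
 * hypothetical placeholders for whatever environment class the program
 * uses, and `some_limit` is an assumed option; only the Agent calls are
 * taken from the declarations above.
 *
 *     Agent agent(options);
 *     while (agent.lifetime() < some_limit) {
 *         action_t action = agent.genRandomAction();  // or a search-based choice
 *         agent.modelUpdate(action);                  // condition the model on the action
 *         env.performAction(action);                  // hypothetical environment call
 *         agent.modelUpdate(env.getObservation(),     // then on the resulting percept
 *                           env.getReward());
 *     }
 */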
// used to store sufficient information to revert an agent
// to a copy of itself from a previous time cycle
class ModelUndo {
public:

    // construct a save point
    ModelUndo(const Agent &agent);

    // saved state lifetime accessor
    lifetime_t lifetime(void) const {
        return m_lifetime;
    }

    // saved state reward accessor
    reward_t reward(void) const {
        return m_reward;
    }

    // saved state history size accessor
    size_t historySize(void) const {
        return m_history_size;
    }

    // saved flag: true if the last update was a percept update
    bool lastUpdate(void) const {
        return m_last_update_percept;
    }

private:

    lifetime_t m_lifetime;
    reward_t m_reward;
    size_t m_history_size;
    bool m_last_update_percept;
};
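
/*
 * Illustrative sketch (an assumption, not taken from the original source):
 * ModelUndo acts as a snapshot so that speculative updates, such as those
 * made while simulating futures during Monte-Carlo search, can be rolled
 * back with Agent::modelRevert(). The action/obs/rew values below are
 * placeholders.
 *
 *     ModelUndo undo(agent);        // save the agent's current state
 *     agent.modelUpdate(action);    // speculative action update
 *     agent.modelUpdate(obs, rew);  // speculative percept update
 *     agent.modelRevert(undo);      // restore the saved state
 */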
#endif // __AGENT_HPP__