# multi_armed_bandit.py
import numpy as np
import matplotlib.pyplot as plt

# Multi-Armed Bandit Problem
# We define a class called Bandit that holds:
#  - the true mean reward of the arm, Q*(a)
#  - the standard deviation of the reward (1 by default)
# Pulling the arm draws a reward from a normal distribution.
class Bandit:
    def __init__(self, average, standard_deviation=1):
        self.average = average
        self.standard_deviation = standard_deviation

    def give_reward(self):
        reward = np.random.normal(self.average, self.standard_deviation)
        return reward
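
# Illustrative usage (hypothetical values): an arm with true mean 1.5
#   arm = Bandit(1.5)
#   r = arm.give_reward()  # one sample drawn from N(1.5, 1)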

# We create 10 bandits whose true means are drawn from N(0, 1);
# each arm's rewards then have a standard deviation of 1 around its mean.
def create_bandits(num_bandits=10, average=0, standard_deviation=1):
    # vector of true means, one per arm
    averages = np.random.normal(average, standard_deviation, num_bandits)
    # vector of standard deviations (all equal)
    standard_deviations = [standard_deviation] * num_bandits
    # create the bandits and store them in a list
    bandits = []
    for i in range(num_bandits):
        bandits.append(Bandit(averages[i], standard_deviations[i]))
    return bandits, averages

# We create the initial value estimates Q(a) and the pull counts N(a)
def initial_estimate(num_bandits, value_estimate):
    average_estimate = [value_estimate] * num_bandits  # Q
    number_bandit_choices = [0] * num_bandits          # N
    return average_estimate, number_bandit_choices
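
# Note: an optimistic initial Q (e.g. value_estimate=5 while the true means
# are ~N(0, 1)) makes even the pure greedy policy try every arm early, since
# each observed reward drags the inflated estimate below the untried arms.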

# Greedy policy: take the action with the highest estimated value,
# breaking ties uniformly at random.
def greedy(average_estimate, bandits, number_bandit_choices):
    # indices of all arms tied for the maximum estimate, flattened to a list
    choice = np.random.choice(np.argwhere(average_estimate == np.amax(average_estimate)).flatten().tolist())
    reward = bandits[choice].give_reward()
    number_bandit_choices[choice] += 1
    # incremental sample-average update of Q(choice)
    average_estimate[choice] = (number_bandit_choices[choice]-1)/number_bandit_choices[choice]*average_estimate[choice] + 1/number_bandit_choices[choice]*reward
    return reward, number_bandit_choices, average_estimate
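
# The update above is the standard incremental mean:
#   Q_n = ((n-1)/n) * Q_{n-1} + (1/n) * R_n = Q_{n-1} + (R_n - Q_{n-1}) / n
# so no history of past rewards has to be stored.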

# Epsilon-greedy policy: exploit with probability 1 - epsilon,
# otherwise pick an arm uniformly at random.
def epsilon_greedy(average_estimate, bandits, number_bandit_choices, epsilon):
    p = np.random.uniform()  # uniformly distributed number in [0, 1)
    if p > epsilon:
        choice = np.random.choice(np.argwhere(average_estimate == np.amax(average_estimate)).flatten().tolist())  # exploitation
    else:
        choice = np.random.choice(len(bandits))  # exploration
    reward = bandits[choice].give_reward()
    number_bandit_choices[choice] += 1
    average_estimate[choice] = (number_bandit_choices[choice]-1)/number_bandit_choices[choice]*average_estimate[choice] + 1/number_bandit_choices[choice]*reward
    return reward, number_bandit_choices, average_estimate
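
# With epsilon = 0.1 roughly 10% of the pulls are uniformly random, so with
# 10 arms the current best arm is chosen on about 90% + 10%/10 = 91% of the
# steps once the estimates have settled.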

# UCB policy: choose the action with the highest upper confidence bound;
# the more often an arm is pulled, the smaller its exploration bonus becomes.
def UCB(average_estimate, bandits, number_bandit_choices, c, potentials, i):  # c is the exploration constant
    # pick among the arms tied for the highest potential (UCB value)
    choice = np.random.choice(np.argwhere(potentials == np.amax(potentials)).flatten())
    reward = bandits[choice].give_reward()
    number_bandit_choices[choice] += 1
    average_estimate[choice] = (number_bandit_choices[choice]-1)/number_bandit_choices[choice]*average_estimate[choice] + 1/number_bandit_choices[choice]*reward
    for j in range(len(bandits)):
        if number_bandit_choices[j] > 0:
            # i is the current time step, j indexes the arms
            potentials[j] = average_estimate[j] + c*np.sqrt(np.log(i+1)/number_bandit_choices[j])
    return reward, number_bandit_choices, average_estimate, potentials
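
# UCB value per arm: potentials[j] = Q(j) + c * sqrt(ln(t) / N(j)).
# Arms that have never been pulled keep their initial potential of +inf,
# which guarantees every arm is tried at least once before the bound matters.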

# All question_* functions below draw their curve into this shared figure
fig = plt.figure()

# Question 1: greedy policy with Q(a) = 0, averaged over 2000 independent runs
def question_one():
    average_rewards = [0]*1000
    for j in range(1, 2001):
        bandits, averages = create_bandits()
        average_estimate, number_bandit_choices = initial_estimate(10, 0)  # Q(a)=0
        rewards = []
        for i in range(1000):
            reward, number_bandit_choices, average_estimate = greedy(average_estimate, bandits, number_bandit_choices)
            rewards.append(reward)
        # incremental average of the reward curve over the runs
        average_rewards = np.multiply((j-1)/j, average_rewards) + np.multiply(1/j, rewards)
    plt.plot(average_rewards, label='greedy Q(a)=0')
question_one()

# Question 2: greedy policy with an optimistic initial estimate Q(a) = 5
def question_two():
    average_rewards = [0]*1000
    for j in range(1, 2001):
        bandits, averages = create_bandits()
        average_estimate, number_bandit_choices = initial_estimate(10, 5)  # Q(a)=5
        rewards = []
        for i in range(1000):
            reward, number_bandit_choices, average_estimate = greedy(average_estimate, bandits, number_bandit_choices)
            rewards.append(reward)
        average_rewards = np.multiply((j-1)/j, average_rewards) + np.multiply(1/j, rewards)
    plt.plot(average_rewards, label='greedy Q(a)=5')
question_two()

# Question 3: epsilon-greedy policy with epsilon = 0.1
def question_three():
    average_rewards = [0]*1000
    epsilon = 0.1
    for j in range(1, 2001):
        bandits, averages = create_bandits()
        average_estimate, number_bandit_choices = initial_estimate(10, 0)  # Q(a)=0
        rewards = []
        for i in range(1000):
            reward, number_bandit_choices, average_estimate = epsilon_greedy(average_estimate, bandits, number_bandit_choices, epsilon)
            rewards.append(reward)
        average_rewards = np.multiply((j-1)/j, average_rewards) + np.multiply(1/j, rewards)
    plt.plot(average_rewards, label='epsilon greedy 0.1')
question_three()

# Question 4: epsilon-greedy policy with epsilon = 0.01
def question_four():
    average_rewards = [0]*1000
    epsilon = 0.01
    for j in range(1, 2001):
        bandits, averages = create_bandits()
        average_estimate, number_bandit_choices = initial_estimate(10, 0)  # Q(a)=0
        rewards = []
        for i in range(1000):
            reward, number_bandit_choices, average_estimate = epsilon_greedy(average_estimate, bandits, number_bandit_choices, epsilon)
            rewards.append(reward)
        average_rewards = np.multiply((j-1)/j, average_rewards) + np.multiply(1/j, rewards)
    plt.plot(average_rewards, label='epsilon greedy 0.01')
question_four()

# Question 5: UCB policy with c = sqrt(2)
def question_five(c=np.sqrt(2)):
    average_rewards = [0]*1000
    for j in range(1, 2001):
        bandits, averages = create_bandits()
        average_estimate, number_bandit_choices = initial_estimate(10, 0)  # Q(a)=0
        potentials = [float('inf')]*len(bandits)
        rewards = []
        for i in range(1000):
            reward, number_bandit_choices, average_estimate, potentials = UCB(average_estimate, bandits, number_bandit_choices, c, potentials, i)
            rewards.append(reward)
        average_rewards = np.multiply((j-1)/j, average_rewards) + np.multiply(1/j, rewards)
    plt.plot(average_rewards, label='ucb c=sqrt(2)')
question_five()

plt.xlabel('number of trials')
plt.ylabel('average reward')
plt.title('Comparison of the average reward with respect to the policy')
plt.legend()
fig.savefig('ex1.png')

# Generic helper to plot several reward curves on one figure (not called above)
def plot(average_rewards, label, title, filename):
    fig = plt.figure()
    ax = plt.subplot(111)
    # one tick every 100 trials
    ax.set_xticks(np.arange(0, len(average_rewards[0]) + 1, 100))
    for i in range(len(average_rewards)):
        ax.plot(average_rewards[i], label=label[i])
    plt.xlabel('number of trials')
    plt.ylabel('average reward')
    plt.title(title)
    ax.legend()
    fig.savefig(filename + '.png')
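
# Illustrative call (hypothetical data): plot two pre-computed curves at once.
#   curves = [np.zeros(1000), np.ones(1000)]  # placeholder reward curves
#   plot(curves, ['policy A', 'policy B'], 'demo title', 'demo_plot')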