Alignment.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 20 09:36:52 2017
@author: rohit
"""
import tensorflow as tf
import tensorflow.contrib.slim as slim
import ubmatrix
import numpy as np
import time
sess = tf.InteractiveSession()
first_peak = []
def finished(s):
    """
    determines if the crystal is in a diffracting position for the 100 equivalences
    """
    s = [s[0], s[1]]
    x = ubmatrix.motor_pos([1,0,0]) #list of motor positions for the 100 equivalences
    if s in x:
        print("--------------------DONE--------------------", x[x.index(s)])
        global first_peak
        first_peak = x[x.index(s)]
        return True
    return False
def finished2(s):
    """
    determines if the crystal is in a different diffracting position for the 100 equivalences
    NOT YET IMPLEMENTED
    """
    s = [s[0], s[1]]
    x = ubmatrix.motor_pos([1,0,0])
    if (s in x) and not (x[x.index(s)] == first_peak or x[x.index(s)] == np.negative(first_peak)):
        print("--------------------DONE--------------------", x[x.index(s)])
        return True
    return False
def action(a):
    """
    Returns the motor movement selected by index 'a'.
    The action list could also contain negative movements, but for now only positive moves are used.
    """
    return [[1,0],[0,1],[1,1]][a]
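#For example, action(0) moves only the first motor, action(1) only the second, and action(2) moves
#both motors by one step (presumably one degree, given the 360-degree wrap-around used below).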
def reward(s):
    """
    rewards the agent if the crystal is in a diffracting position
    reward == 1 if hit, 0 otherwise
    """
    if finished(s):
        return 1
    return 0
def discounted_rewards(r):
    """
    increases the reward for moves that result in a reflection based on their proximity to the reflection.
    moves made right before a reflection is hit get a larger reward than earlier moves.
    """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in range(0, np.size(r)):
        running_add = running_add*0.99 + r[t]
        discounted_r[t] = running_add
    return discounted_r
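#For example, discounted_rewards(np.array([1., 0., 0.])) returns [1., 0.99, 0.9801]:
#the running sum carries each reward forward with a decay factor of 0.99 per step.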
class agent():
    """
    Machine learning agent. Uses the motor positions and previous experiences to make the next move.
    After every reflection is hit, the agent updates its network to speed up the reflection finding process.
    Uses the Adam optimizer provided by tensorflow.
    """
    def __init__(self, lr, s_size, a_size, h_size):
        self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32, name="STATE_IN")
        hidden = slim.fully_connected(self.state_in, h_size, biases_initializer=None, activation_fn=tf.nn.relu)
        self.output = slim.fully_connected(hidden, a_size, biases_initializer=None, activation_fn=tf.nn.softmax)
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32, name="REWARD_HOLDER")
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32, name="ACTION_HOLDER")
        self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes) #the probabilities of the actions that were chosen
        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.reward_holder[-1]) #mean of -ln(chosen probability), scaled by the final discounted reward
        tvars = tf.trainable_variables() #the hidden layer and the output layer
        self.gradient_holders = []
        for idx, var in enumerate(tvars): #idx = index, var = layer
            placeholder = tf.placeholder(tf.float32, name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        self.gradients = tf.gradients(self.loss, tvars)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr) #the optimization method we use
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, tvars))
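#Note on the gather trick in __init__: self.output has shape [batch, a_size]. Flattening it to one
#dimension and indexing with row*a_size + action_holder[row] picks out, for each batch row, the
#probability the network assigned to the action that was actually taken.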
tf.reset_default_graph()
merged = tf.summary.merge_all()
myAgent = agent(lr=0.0001, s_size=2, a_size=3, h_size=400) #learning rate, state size, action size, hidden layer size
#number of episodes to run, max number of movements per episode
total_episodes = 5000
max_ep = 100000
update_frequency = 1
#initialize the agent's variables
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    i = 0
    s = [0,0]
    total_reward = []
    total_length = []
    total = []
    gradBuffer = sess.run(tf.trainable_variables())
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad*0
    while i < total_episodes: #begin training the agent
        running_reward = 0
        ep_history = []
        for j in range(max_ep):
            s = [s[0]%360, s[1]%360]
            a_dist = sess.run(myAgent.output, feed_dict={myAgent.state_in: [s]}) #action distribution from the agent
            a = np.random.choice(a_dist[0], p=a_dist[0])
            l1 = []
            for n in range(len(a_dist[0])):
                if (a+0.01 > a_dist[0][n] > a-0.01):
                    l1.append(n)
            a = np.random.choice(l1) #chooses an action index based on the a_dist values
            if np.random.rand() <= 0.01:
                a = np.random.randint(3) #move randomly every now and then
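            #Note on the sampling above: np.random.choice(a_dist[0], p=a_dist[0]) returns a probability
            #value rather than an index, so l1 collects every action index whose probability is within
            #0.01 of the sampled value and one of those indices is then chosen at random; the occasional
            #np.random.randint(3) adds a small amount of extra exploration.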
            a_v = action(a)
            s1 = np.add(s, a_v) #update the state based on the action chosen
            r = reward(s1)*1000/(1+j) #determine the reward of the new state, scaled down the longer the episode has run
            if j >= max_ep-1: #if the agent has not hit a reflection in max_ep moves, it has failed miserably
                print("Failure")
                total.append(max_ep)
                r = -10
            s1 = [s1[0]%360, s1[1]%360] #ensure that the state has not gone past 360 degrees (wraps around)
            ep_history.append([s, a, r, s1]) #append the state-action-reward-newstate entry to the history
            s = s1 #update the state
            running_reward += r
            if j%10000 == 0: #print attributes every 10,000 moves
                print("a_dist: ", a_dist, "a: ", a, " L1: ", l1, s, i, r)
            if finished(s):
                #if a reflection is hit, the agent needs to attempt to learn why, and how to do so again
                ep_history = np.array(ep_history)
                r = (max_ep - j)/10000
                np.append(ep_history[2], r) #note: np.append returns a new array, so this call does not modify ep_history in place
                total.append(j)
                print();print();print();print();print(s, "Success, ", j);print();print();print();print()
                ep_history = np.array(ep_history)
                ep_history[:,2] = discounted_rewards(ep_history[:, 2])
                print(ep_history[2])
                feed_dict = {myAgent.reward_holder: ep_history[:,2], myAgent.action_holder: ep_history[:,1], myAgent.state_in: np.vstack(ep_history[:,0])}
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                for idx, grad in enumerate(grads):
                    gradBuffer[idx] += grad
                if i%update_frequency == 0 and i != 0:
                    feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer)) #updates the agent's weights and, therefore, its outputs
                    _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    for ix, grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad*0
                total_reward.append(running_reward)
                print("a_dist: ", a_dist, "a: ", a, " L1: ", l1, s, i, r)
                break
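            #How the update works: the per-episode policy gradients are summed into gradBuffer, and every
            #update_frequency episodes they are fed through myAgent.gradient_holders into the Adam update
            #(myAgent.update_batch); gradBuffer is then reset to zero.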
            #----------------------------------------------------------- portion for finding the second reflection, not yet finished ---------------------------------------------------
            # if finished2(s):
            #     print("ROUND 2 FINISHED, A SECOND PEAK IS HIT")
            #     print(s);print();print();print()
            #     ep_history = np.array(ep_history)
            #     r = (max_ep - j)/10000
            #     print(ep_history[:,2])
            #     np.append(ep_history[2], r)
            #     print(ep_history[:,2])
            #     total.append(j)
            #     print();print();print();print();print(s, "Success, ", j);print();print();print();print()
            ##    time.sleep(2)
            #     print(ep_history[:,2], r)
            #     ep_history = np.array(ep_history)
            #     ep_history[:,2] = discounted_rewards(ep_history[:, 2])
            #     print(ep_history[2])
            #     feed_dict = {myAgent.reward_holder: ep_history[:,2], myAgent.action_holder: ep_history[:,1], myAgent.state_in: np.vstack(ep_history[:,0])}
            #     grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
            #     for idx, grad in enumerate(grads):
            #         gradBuffer[idx] += grad
            #
            #     if i%update_frequency == 0 and i != 0:
            #         feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
            #         _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
            #         for ix, grad in enumerate(gradBuffer):
            #             gradBuffer[ix] = grad*0
            #     s = [0,0]
            #     break #out of the for loop
            ## ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if i%100 == 0:
            print(sess.run(myAgent.loss, feed_dict={myAgent.reward_holder: ep_history[:,2], myAgent.action_holder: ep_history[:,1], myAgent.state_in: np.vstack(ep_history[:,0])}), 'LOSS')
            print(np.mean(total[-100:]), "a")
        if i%100 == 0 and i > 0 and j == 0:
            print(total, i, "k")
        i = i+1