Data Normalization & Weight average over examples
Signed-off-by: AmrElsersy <amrelsersay@gmail.com>
AmrElsersy committed Jan 7, 2021
1 parent baf3420 commit adac598
Showing 5 changed files with 53 additions and 25 deletions.
34 changes: 27 additions & 7 deletions nn/Linear.py
@@ -20,24 +20,44 @@ def init_weights(self,indim, outdim):
self.weights['b'] = np.zeros((outdim, 1))

def forward(self,X):
# output dims = (output_layer x features) . (features x batch_size) = (output_layer x batch_size)
# print(self.weights['b'].shape)
output = np.dot(self.weights['w'].T ,X) + self.weights['b']
self.cache['x'] = X
self.cache['output'] = output

return output

def backward(self,global_grad):
"""
compute backward probagation: multiply the global gradient with the local gradient with respect to each input (W,b,X)
args:
global_grad: global gradient of the next layer(represent dL/dX_of_next_layer)
dims: (output_nuorons_of_next_layer, batch_size)
return:
global gradient to backprobagate to the prev layer
dims: (output_nuorons_of_current_layer, batch_size)
"""
batch_size = global_grad.shape[1]


dX = np.dot(self.local_grads['x'], global_grad )
# print("local grad x")
# print(self.local_grads['x'].shape)

# ========= dW dims ==========
# dW has the same dims as W, because we need to compute w = w - lr * dW
# note that dW is the global gradient; the local gradient (dY/dW) has different dims because it is a function of the input
dW = np.dot(np.array(self.local_grads['w']) , global_grad.T )
# same as dW above
db = np.sum(global_grad, axis = 1, keepdims = True)
# dW(x_features, output) = dw_local(x_features, batch) * global.T(batch, output)

# ========= / batch_size .. average over examples =========
# divide by batch_size because the matrix multiplication of the batch row in dw_local with the batch column in global_grad.T sums over examples
# so we divide to turn that sum into an average
dW = np.dot(np.array(self.local_grads['w']) , global_grad.T ) / batch_size
db = np.sum(global_grad, axis = 1, keepdims = True) / batch_size

self.weights_global_grads = {'w': dW, 'b': db}

# =============== PRINT ====================
# print("global=",global_grad.shape, " ..dX=",dX.shape, " .. dW_glbal=",dW.shape," .. dW_local=",np.array(self.local_grads['w']).shape)

# return the global gradient with respect to the input (the output of the previous layer)
return dX

def calculate_local_grads(self, X):
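
A note on the averaging above: the single matrix product of dw_local with global_grad.T is exactly the sum of one outer product per example, so dividing by batch_size turns that sum into a mean over the batch. A minimal NumPy sketch of that equivalence (the shapes and variable names below are illustrative, not taken from this repository):

import numpy as np

# Hypothetical shapes: 4 input features, 3 output neurons, batch of 5.
features, outputs, batch = 4, 3, 5
local_w = np.random.randn(features, batch)     # local grad dY/dW, one column per example
global_grad = np.random.randn(outputs, batch)  # dL/dY from the next layer, one column per example

# Batched form used in backward(): one matmul, then divide by the batch size.
dW_batched = np.dot(local_w, global_grad.T) / batch   # (features, outputs)

# Equivalent per-example form: one outer product per sample, then the mean.
dW_mean = np.mean(
    [np.outer(local_w[:, i], global_grad[:, i]) for i in range(batch)],
    axis=0,
)

print(np.allclose(dW_batched, dW_mean))  # True: the matmul sums over examples, /batch_size averages
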
36 changes: 22 additions & 14 deletions nn/MNIST_test.py
@@ -4,10 +4,11 @@
from optim import GradientDecent, SGD, Adam
from activations import ReLU,Sigmoid
from loss import CrossEntropyLoss
import time

# MNIST Dataset
dataset = Dataset("train.csv")
dataloader = Data_Loader(dataset, 32)
dataloader = Data_Loader(dataset, 1)


model = Model()
@@ -19,16 +20,23 @@

model.set_loss(CrossEntropyLoss())

optimizer = GradientDecent(model.parameters(), learning_rate = 0.001)

i = 0
for image, label in dataloader:
# if i == 1700:
# break
i = i + 1
print("Iteration no.", i)
predicted = model(image)
loss = model.loss(predicted, label)
model.backward()
optimizer.step()
print("loss= ", loss)
optimizer = GradientDecent(model.parameters(), learning_rate = 0.01)


epochs = 1
for epoch in range(epochs):
i = 0
for image, label in dataloader:
# if i == 1700:
# break
image = image/255
i = i + 1
print("Iteration no.", i)
predicted = model(image)
loss = model.loss(predicted, label)
model.backward()
optimizer.step()
print("loss= ", loss)
# time.sleep(0.1)
print("===========")

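On the image = image/255 line: raw MNIST pixels are integers in 0–255, and dividing by 255 rescales them to [0, 1] before the forward pass, which keeps the first layer's pre-activations in a reasonable range. A minimal sketch of the same scaling, assuming a NumPy batch of uint8 pixels (the array below is made up for illustration):

import numpy as np

# Hypothetical raw batch: 784 pixels x 32 examples, integer values in 0..255.
raw = np.random.randint(0, 256, size=(784, 32), dtype=np.uint8)

# Scale into [0, 1]; the explicit float32 cast keeps a small float dtype
# (plain raw / 255 would also work and promote to float64).
normalized = raw.astype(np.float32) / 255.0

print(raw.dtype, raw.min(), raw.max())                       # uint8, roughly 0 and 255
print(normalized.dtype, normalized.min(), normalized.max())  # float32, within [0.0, 1.0]
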
3 changes: 2 additions & 1 deletion nn/loss.py
@@ -50,8 +50,9 @@ def forward(self, Y_hat, Y):
crossentropy_loss = max(np.mean(log_probs), 0) # average over both axis 0 and axis 1
# crossentropy_loss = np.sum(crossentropy_loss, axis=1, keepdims=True)
#print("Dims", probs.shape)

print('Label =',Y)
print('Prediction = ',np.argmax(probs,axis=0))
print('Pred = ',np.argmax(probs,axis=0))

# caching for backprop
self.cache['probs'] = probs
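
For reference on the batch averaging here, a generic sketch (not necessarily identical to this repository's CrossEntropyLoss): a common way to compute batch-averaged cross-entropy from softmax probabilities laid out with classes along axis 0, matching the argmax above, is:

import numpy as np

def batch_cross_entropy(probs, labels):
    """probs: (num_classes, batch_size) softmax outputs; labels: (batch_size,) integer class ids."""
    batch_size = probs.shape[1]
    # Probability the model assigned to the true class of each example.
    true_class_probs = probs[labels, np.arange(batch_size)]
    # Negative log-likelihood per example, averaged over the batch.
    return -np.mean(np.log(true_class_probs + 1e-12))

# Illustrative 3-class batch of 2 examples (each column sums to 1).
probs = np.array([[0.7, 0.1],
                  [0.2, 0.8],
                  [0.1, 0.1]])
labels = np.array([0, 1])
print(batch_cross_entropy(probs, labels))  # (-ln 0.7 - ln 0.8) / 2, about 0.29
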
3 changes: 2 additions & 1 deletion nn/model.py
@@ -36,6 +36,7 @@ def backward(self):
global_grad = self.loss_function.backward()
for layer in reversed(self.layers):
global_grad = layer.backward(global_grad)
# print(global_grad.shape)

def __call__(self, x):
return self.forward(x)
@@ -44,5 +45,5 @@ def train_mode(self):
self.is_train_mode = True
def eval_mode(self):
self.is_train_mode = False
def parameters(self):
return [layer for layer in self.layers if isinstance(layer, Layer)]
2 changes: 0 additions & 2 deletions nn/optim.py
@@ -35,8 +35,6 @@ def optimize(self, w, dw):

class GradientDecent(Optimizer):
def optimize(self, w, dw):
# dw = np.mean(dw, axis=1, keepdims=True)
dw = dw / np.max(dw)
w = w - self.lr * dw
return w

