Data Normalization & Weight average over examples
Signed-off-by: AmrElsersy <amrelsersay@gmail.com>
AmrElsersy committed Jan 7, 2021
1 parent baf3420 commit adac598
Showing 5 changed files with 53 additions and 25 deletions.
34 changes: 27 additions & 7 deletions nn/Linear.py
@@ -20,24 +20,44 @@ def init_weights(self,indim, outdim):
self.weights['b'] = np.zeros((outdim, 1))

def forward(self,X):
# output dims = (output_layer x features) . (features x batch_size) = (output_layer x batch_size)
# print(self.weights['b'].shape)
output = np.dot(self.weights['w'].T ,X) + self.weights['b']
self.cache['x'] = X
self.cache['output'] = output

return output

def backward(self,global_grad):
"""
compute backward probagation: multiply the global gradient with the local gradient with respect to each input (W,b,X)
args:
global_grad: global gradient of the next layer(represent dL/dX_of_next_layer)
dims: (output_nuorons_of_next_layer, batch_size)
return:
global gradient to backprobagate to the prev layer
dims: (output_nuorons_of_current_layer, batch_size)
"""
batch_size = global_grad.shape[1]


dX = np.dot(self.local_grads['x'], global_grad )
# print("local grad x")
# print(self.local_grads['x'].shape)

# ========= dW dims ==========
# dW has the same dims as W, because we need to compute w = w - lr * dW
# note that dW is the global gradient; the local gradient (dY/dW) has different dims because it is a function of the input
dW = np.dot(np.array(self.local_grads['w']) , global_grad.T )
# same as dW above
db = np.sum(global_grad, axis = 1, keepdims = True)
# dW(x_features, output) = dw_local(x_features, batch) * global.T(batch, output)

# ========= / batch_size .. average over examples =========
# divide by batch_size because the matrix multiplication of the batch row in dw_local with the batch column in global_grad.T sums over examples
# so we divide to turn that sum into an average
dW = np.dot(np.array(self.local_grads['w']) , global_grad.T ) / batch_size
db = np.sum(global_grad, axis = 1, keepdims = True) / batch_size

self.weights_global_grads = {'w': dW, 'b': db}

# =============== PRINT ====================
# print("global=",global_grad.shape, " ..dX=",dX.shape, " .. dW_glbal=",dW.shape," .. dW_local=",np.array(self.local_grads['w']).shape)

# return the global gradient with respect to the input (the output of the previous layer)
return dX

def calculate_local_grads(self, X):
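
A note on the averaging above: the single matrix product of dw_local with global_grad.T is exactly the sum of one outer product per example, so dividing by batch_size turns that sum into a mean over the batch. A minimal NumPy sketch of that equivalence (the shapes and variable names below are illustrative, not taken from this repository):

import numpy as np

# Hypothetical shapes: 4 input features, 3 output neurons, batch of 5.
features, outputs, batch = 4, 3, 5
local_w = np.random.randn(features, batch)     # local grad dY/dW, one column per example
global_grad = np.random.randn(outputs, batch)  # dL/dY from the next layer, one column per example

# Batched form used in backward(): one matmul, then divide by the batch size.
dW_batched = np.dot(local_w, global_grad.T) / batch   # (features, outputs)

# Equivalent per-example form: one outer product per sample, then the mean.
dW_mean = np.mean(
    [np.outer(local_w[:, i], global_grad[:, i]) for i in range(batch)],
    axis=0,
)

print(np.allclose(dW_batched, dW_mean))  # True: the matmul sums over examples, /batch_size averages
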
36 changes: 22 additions & 14 deletions nn/MNIST_test.py
@@ -4,10 +4,11 @@
from optim import GradientDecent, SGD, Adam
from activations import ReLU,Sigmoid
from loss import CrossEntropyLoss
import time

# MNIST Dataset
dataset = Dataset("train.csv")
dataloader = Data_Loader(dataset, 32)
dataloader = Data_Loader(dataset, 1)


model = Model()
@@ -19,16 +20,23 @@

model.set_loss(CrossEntropyLoss())

optimizer = GradientDecent(model.parameters(), learning_rate = 0.001)

i = 0
for image, label in dataloader:
# if i == 1700:
# break
i = i + 1
print("Iteration no.", i)
predicted = model(image)
loss = model.loss(predicted, label)
model.backward()
optimizer.step()
print("loss= ", loss)
optimizer = GradientDecent(model.parameters(), learning_rate = 0.01)


epochs = 1
for epoch in range(epochs):
i = 0
for image, label in dataloader:
# if i == 1700:
# break
image = image/255
i = i + 1
print("Iteration no.", i)
predicted = model(image)
loss = model.loss(predicted, label)
model.backward()
optimizer.step()
print("loss= ", loss)
# time.sleep(0.1)
print("===========")

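On the image = image/255 line: raw MNIST pixels are integers in 0–255, and dividing by 255 rescales them to [0, 1] before the forward pass, which keeps the first layer's pre-activations in a reasonable range. A minimal sketch of the same scaling, assuming a NumPy batch of uint8 pixels (the array below is made up for illustration):

import numpy as np

# Hypothetical raw batch: 784 pixels x 32 examples, integer values in 0..255.
raw = np.random.randint(0, 256, size=(784, 32), dtype=np.uint8)

# Scale into [0, 1]; the explicit float32 cast keeps a small float dtype
# (plain raw / 255 would also work and promote to float64).
normalized = raw.astype(np.float32) / 255.0

print(raw.dtype, raw.min(), raw.max())                       # uint8, roughly 0 and 255
print(normalized.dtype, normalized.min(), normalized.max())  # float32, within [0.0, 1.0]
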
3 changes: 2 additions & 1 deletion nn/loss.py
@@ -50,8 +50,9 @@ def forward(self, Y_hat, Y):
crossentropy_loss = max(np.mean(log_probs), 0) # average over both axis 0 and axis 1
# crossentropy_loss = np.sum(crossentropy_loss, axis=1, keepdims=True)
#print("Dims", probs.shape)

print('Label =',Y)
print('Prediction = ',np.argmax(probs,axis=0))
print('Pred = ',np.argmax(probs,axis=0))

# caching for backprop
self.cache['probs'] = probs
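
For reference on the batch averaging here, a generic sketch (not necessarily identical to this repository's CrossEntropyLoss): a common way to compute batch-averaged cross-entropy from softmax probabilities laid out with classes along axis 0, matching the argmax above, is:

import numpy as np

def batch_cross_entropy(probs, labels):
    """probs: (num_classes, batch_size) softmax outputs; labels: (batch_size,) integer class ids."""
    batch_size = probs.shape[1]
    # Probability the model assigned to the true class of each example.
    true_class_probs = probs[labels, np.arange(batch_size)]
    # Negative log-likelihood per example, averaged over the batch.
    return -np.mean(np.log(true_class_probs + 1e-12))

# Illustrative 3-class batch of 2 examples (each column sums to 1).
probs = np.array([[0.7, 0.1],
                  [0.2, 0.8],
                  [0.1, 0.1]])
labels = np.array([0, 1])
print(batch_cross_entropy(probs, labels))  # (-ln 0.7 - ln 0.8) / 2, about 0.29
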
3 changes: 2 additions & 1 deletion nn/model.py
@@ -36,6 +36,7 @@ def backward(self):
global_grad = self.loss_function.backward()
for layer in reversed(self.layers):
global_grad = layer.backward(global_grad)
# print(global_grad.shape)

def __call__(self, x):
return self.forward(x)
@@ -44,5 +45,5 @@ def train_mode(self):
self.is_train_mode = True
def eval_mode(self):
self.is_train_mode = False
def parameters(self):
return [layer for layer in self.layers if isinstance(layer, Layer)]
2 changes: 0 additions & 2 deletions nn/optim.py
@@ -35,8 +35,6 @@ def optimize(self, w, dw):

class GradientDecent(Optimizer):
def optimize(self, w, dw):
# dw = np.mean(dw, axis=1, keepdims=True)
dw = dw / np.max(dw)
w = w - self.lr * dw
return w

