From adac598d78598ccce136350956fe9a5c2e8f134d Mon Sep 17 00:00:00 2001
From: AmrElsersy
Date: Thu, 7 Jan 2021 21:31:00 +0200
Subject: [PATCH] Data Normalization & Weight average over examples

Signed-off-by: AmrElsersy
---
 nn/Linear.py     | 34 +++++++++++++++++++++++++++-------
 nn/MNIST_test.py | 36 ++++++++++++++++++++++--------------
 nn/loss.py       |  3 ++-
 nn/model.py      |  3 ++-
 nn/optim.py      |  2 --
 5 files changed, 53 insertions(+), 25 deletions(-)

diff --git a/nn/Linear.py b/nn/Linear.py
index 259b3b2..7fb9cf1 100644
--- a/nn/Linear.py
+++ b/nn/Linear.py
@@ -20,8 +20,6 @@ def init_weights(self,indim, outdim):
         self.weights['b'] = np.zeros((outdim, 1))
 
     def forward(self,X):
-        # output dims = (output_layer x features) . (features x batch_size) = (output_layer x batch_size)
-        # print(self.weights['b'].shape)
         output = np.dot(self.weights['w'].T ,X) + self.weights['b']
         self.cache['x'] = X
         self.cache['output'] = output
@@ -29,15 +27,37 @@ def forward(self,X):
         return output
 
     def backward(self,global_grad):
+        """
+        compute backward propagation: multiply the global gradient by the local gradient with respect to each input (W, b, X)
+        args:
+            global_grad: global gradient of the next layer (represents dL/dX of the next layer)
+                         dims: (output_neurons_of_next_layer, batch_size)
+        return:
+            global gradient to backpropagate to the previous layer
+            dims: (output_neurons_of_current_layer, batch_size)
+        """
+        batch_size = global_grad.shape[1]
+
+
         dX = np.dot(self.local_grads['x'], global_grad )
-        # print("local grad x")
-        # print(self.local_grads['x'].shape)
+
+        # ========= dW dims ==========
         # dW dims = W dims .. because we have to calculate w = w - lr * dW
         # note that dW is global gradient .... but the local gradient (dY/dw) has a different dims as it is a function of the input
-        dW = np.dot(np.array(self.local_grads['w']) , global_grad.T )
-        # same as dW above
-        db = np.sum(global_grad, axis = 1, keepdims = True)
+        # dW(x_features, output) = dw_local(x_features, batch) * global.T(batch, output)
+
+        # ========= / batch_size .. average over examples =========
+        # divide by batch_size because the matrix multiplication of the batch row in dw_local & the batch column in global_grad.T sums over the examples
+        # so we need to divide to turn that sum into an average
+        dW = np.dot(np.array(self.local_grads['w']) , global_grad.T ) / batch_size
+        db = np.sum(global_grad, axis = 1, keepdims = True) / batch_size
+
         self.weights_global_grads = {'w': dW, 'b': db}
+
+        # =============== PRINT ====================
+        # print("global=",global_grad.shape, " ..dX=",dX.shape, " .. dW_global=",dW.shape," .. dW_local=",np.array(self.local_grads['w']).shape)
+
+        # return the global gradient with respect to the input (the output of the previous layer)
         return dX
 
     def calculate_local_grads(self, X):
diff --git a/nn/MNIST_test.py b/nn/MNIST_test.py
index c51b0a6..5876801 100644
--- a/nn/MNIST_test.py
+++ b/nn/MNIST_test.py
@@ -4,10 +4,11 @@
 from optim import GradientDecent, SGD, Adam
 from activations import ReLU,Sigmoid
 from loss import CrossEntropyLoss
+import time
 
 # MNIST Dataset
 dataset = Dataset("train.csv")
-dataloader = Data_Loader(dataset, 32)
+dataloader = Data_Loader(dataset, 1)
 
 model = Model()
 
@@ -19,16 +20,23 @@
 
 model.set_loss(CrossEntropyLoss())
 
-optimizer = GradientDecent(model.parameters(), learning_rate = 0.001)
-
-i = 0
-for image, label in dataloader:
-    # if i == 1700:
-    #     break
-    i = i + 1
-    print("Iteration no.", i)
-    predicted = model(image)
-    loss = model.loss(predicted, label)
-    model.backward()
-    optimizer.step()
-    print("loss= ", loss)
+optimizer = GradientDecent(model.parameters(), learning_rate = 0.01)
+
+
+epochs = 1
+for epoch in range(epochs):
+    i = 0
+    for image, label in dataloader:
+        # if i == 1700:
+        #     break
+        image = image/255
+        i = i + 1
+        print("Iteration no.", i)
+        predicted = model(image)
+        loss = model.loss(predicted, label)
+        model.backward()
+        optimizer.step()
+        print("loss= ", loss)
+        # time.sleep(0.1)
+    print("===========")
+
diff --git a/nn/loss.py b/nn/loss.py
index beb70f4..42009f5 100644
--- a/nn/loss.py
+++ b/nn/loss.py
@@ -50,8 +50,9 @@ def forward(self, Y_hat, Y):
         crossentropy_loss = max(np.mean(log_probs), 0) # avrage on both axis 0 & axis 1 ()
         # crossentropy_loss = np.sum(crossentropy_loss, axis=1, keepdims=True)
         #print("Dims", probs.shape)
+        print('Label =',Y)
 
-        print('Prediction = ',np.argmax(probs,axis=0))
+        print('Pred = ',np.argmax(probs,axis=0))
 
         # caching for backprop
         self.cache['probs'] = probs
diff --git a/nn/model.py b/nn/model.py
index 523430d..6478593 100644
--- a/nn/model.py
+++ b/nn/model.py
@@ -36,6 +36,7 @@ def backward(self):
         global_grad = self.loss_function.backward()
         for layer in reversed(self.layers):
             global_grad = layer.backward(global_grad)
+            # print(global_grad.shape)
 
     def __call__(self, x):
         return self.forward(x)
@@ -44,5 +45,5 @@ def train_mode(self):
         self.is_train_mode = True
     def eval_mode(self):
         self.is_train_mode = False
-    def parameters(self):
+    def parameters(self): 
         return [layer for layer in self.layers if isinstance(layer, Layer)]
diff --git a/nn/optim.py b/nn/optim.py
index 84b3e94..0d2d55a 100644
--- a/nn/optim.py
+++ b/nn/optim.py
@@ -35,8 +35,6 @@ def optimize(self, w, dw):
 
 class GradientDecent(Optimizer):
     def optimize(self, w, dw):
-        # dw = np.mean(dw, axis=1, keepdims=True)
-        dw = dw / np.max(dw)
         w = w - self.lr * dw
         return w
 
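A minimal standalone sketch, not part of the patch, of why backward in nn/Linear.py divides dW and db by batch_size: with examples stored column-wise, the single matrix product between the local gradient of W (an (x_features, batch) array per the shape comment above, stood in for here by a plain input matrix X) and the transposed global gradient sums the per-example outer products, so dividing by the batch size turns that sum into an average over the batch. All shapes and variable names below are illustrative, not taken from the repo.

import numpy as np

# Hypothetical shapes: 4 input features, 3 output neurons, batch of 8 examples,
# with examples stored as columns (same layout as the layer in the patch).
features, outputs, batch = 4, 3, 8
X = np.random.randn(features, batch)           # stand-in for the local gradient of W
global_grad = np.random.randn(outputs, batch)  # dL/dY coming from the next layer

# Vectorised form: one matmul sums over the batch, so dividing gives the mean.
dW = np.dot(X, global_grad.T) / batch          # shape (features, outputs), same as W

# Reference: explicit average of the per-example outer products.
dW_ref = np.mean([np.outer(X[:, i], global_grad[:, i]) for i in range(batch)], axis=0)
assert np.allclose(dW, dW_ref)

# The bias gradient follows the same logic: summing dL/dY over the batch
# and dividing by the batch size gives the average bias gradient.
db = np.sum(global_grad, axis=1, keepdims=True) / batch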