Commit

all implementations and wrapping it up
ReyazBeigh committed Jul 30, 2021
1 parent 5c060af commit 8e9f266
Showing 16 changed files with 55 additions and 8 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
- Loading CSV data using pandas; understanding the data using pandas' describe and head methods
- Building the first model: choosing Features (the data used for prediction) and a Target (the value to predict), defining a model, fitting it, predicting, and evaluating (using Python's sklearn library)
- Validating the models; introduction to Mean Absolute Error (MAE, the difference between the actual and predicted values on the training data set); introduction to train_test_split, which splits the given data into a training set and a held-out set for prediction so that we can compare the results
- Underfitting and overfitting concepts based on shallow and deep trees respectively. In underfitting we ignore a lot of features, while in overfitting we make so many splits that each leaf is left with too few records to predict from
- Underfitting and overfitting concepts based on shallow and deep trees respectively. In underfitting we ignore a lot of features, while in overfitting we make so many splits that each leaf is left with too few records to predict from [The shallower the tree, the more we underfit; the deeper, the more we overfit. To overcome this we need to find a middle ground!]
- Introduction to Random Forest (RandomForestRegressor), which overcomes the problems of underfitting and overfitting, resulting in better model selection. Provided by the sklearn library itself

### While choosing a better model can be quite a task, there are Google and AWS services offering AutoML that select the better models automatically, and we just use them for prediction, which is pretty straightforward.
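As a hedged sketch of the workflow these notes describe (the Melbourne housing CSV is not available here, so a tiny fabricated DataFrame stands in for it; the column names mirror the ones used in this commit):

```python
# Minimal end-to-end sketch: load data, split it, fit a decision tree,
# and evaluate with MAE. The DataFrame below is fabricated stand-in data.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Fabricated stand-in for melb_data.csv
data = pd.DataFrame({
    "Rooms":    [2, 3, 4, 3, 5, 2, 4, 3],
    "Landsize": [150, 300, 450, 280, 600, 120, 500, 310],
    "Price":    [400000, 650000, 900000, 620000, 1200000, 380000, 980000, 640000],
})

X = data[["Rooms", "Landsize"]]  # features
y = data["Price"]                # target

# Hold out part of the data so the model is scored on records it never saw
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

model = DecisionTreeRegressor(random_state=0)
model.fit(train_X, train_y)

mae = mean_absolute_error(val_y, model.predict(val_X))
print("MAE:", mae)
```

The same pattern (fit on the training split, score on the held-out split) underlies every file in this commit.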
Binary file added __pycache__/a1_load_data.cpython-39.pyc
Binary file not shown.
Binary file added __pycache__/a2_define_model.cpython-39.pyc
Binary file not shown.
Binary file added __pycache__/a3_validation.cpython-39.pyc
Binary file not shown.
Binary file added __pycache__/a4_split_test_data.cpython-39.pyc
Binary file not shown.
Binary file added __pycache__/a5_under_over_fit.cpython-39.pyc
Binary file not shown.
Binary file added __pycache__/a6_random_forest.cpython-39.pyc
Binary file not shown.
File renamed without changes.
6 changes: 3 additions & 3 deletions define_model_2.py → a2_define_model.py
@@ -1,12 +1,12 @@
import load_data_1
import a1_load_data

from sklearn.tree import DecisionTreeRegressor

y = load_data_1.melb_data.Price
y = a1_load_data.melb_data.Price

features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

X = load_data_1.melb_data[features]
X = a1_load_data.melb_data[features]

mlb_model = DecisionTreeRegressor(random_state=0)

4 changes: 2 additions & 2 deletions validation_3.py → a3_validation.py
@@ -1,10 +1,10 @@
from sklearn.metrics import mean_absolute_error
import define_model_2 as mlb_model_cals
import a2_define_model as mlb_model_cals

prediction_on_all_data = mlb_model_cals.mlb_model.predict(mlb_model_cals.X)

# use a distinct name so the imported mean_absolute_error function is not shadowed
mae = mean_absolute_error(mlb_model_cals.y, prediction_on_all_data)

print("Mean Absolute Error: " + str(mae))
print("Mean Absolute Error, validated on the training data and the price values that we already have: " + str(mae))

print(__file__ + " DONE ")
4 changes: 2 additions & 2 deletions split_test_data_4.py → a4_split_test_data.py
@@ -1,4 +1,4 @@
import define_model_2 as model_def
import a2_define_model as model_def
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
@@ -11,7 +11,7 @@

real_prediction = mlb_model.predict(test_X)

print("MAE on known data to see how good our model is")
print("MAE after splitting our data into training and testing sets")


print(mean_absolute_error(test_y,real_prediction))
33 changes: 33 additions & 0 deletions a5_under_over_fit.py
@@ -0,0 +1,33 @@

import a4_split_test_data as splitted_data
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)

# find the best value for max_leaf_nodes
possible_best_nodes = [5, 10, 20, 30, 50, 100, 200, 500, 1000, 2000, 5000]

optimal_node = possible_best_nodes[0]

temp_mae = None

for leaf in possible_best_nodes:
    mae = get_mae(leaf, splitted_data.train_X, splitted_data.test_X, splitted_data.train_y, splitted_data.test_y)
    if temp_mae is None or mae < temp_mae:
        temp_mae = mae
        optimal_node = leaf

optimal_model = DecisionTreeRegressor(max_leaf_nodes=optimal_node, random_state=0)
optimal_model.fit(splitted_data.train_X, splitted_data.train_y)
preds_val = optimal_model.predict(splitted_data.test_X)
mae = mean_absolute_error(splitted_data.test_y, preds_val)

print("MAE after optimisation [between underfitting and overfitting] -> " + str(mae))

print(__file__+" DONE")
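As an aside, the manual loop above could also be expressed with sklearn's GridSearchCV, which searches the same max_leaf_nodes grid with cross-validation. This is a hypothetical refactor, not part of the commit; synthetic data stands in for the Melbourne CSV:

```python
# Alternative sketch of the max_leaf_nodes search using GridSearchCV
# (hypothetical refactor, not part of this commit).
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor

# Synthetic regression data stands in for the housing CSV
X, y = make_regression(n_samples=400, n_features=5, noise=5.0, random_state=0)
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=0)

search = GridSearchCV(
    DecisionTreeRegressor(random_state=0),
    param_grid={"max_leaf_nodes": [5, 10, 20, 30, 50, 100, 200, 500]},
    scoring="neg_mean_absolute_error",  # GridSearchCV maximises, so MAE is negated
    cv=5,
)
search.fit(train_X, train_y)

optimal_node = search.best_params_["max_leaf_nodes"]
print("best max_leaf_nodes:", optimal_node)
```

The design difference: the commit's loop scores each candidate against one fixed test split, while GridSearchCV averages over cross-validation folds, which is less sensitive to a lucky or unlucky single split.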
9 changes: 9 additions & 0 deletions a6_random_forest.py
@@ -0,0 +1,9 @@
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import a4_split_test_data as splitted_data

forest_model = RandomForestRegressor(random_state=0)
forest_model.fit(splitted_data.train_X, splitted_data.train_y)
prediction = forest_model.predict(splitted_data.test_X)
mae = mean_absolute_error(splitted_data.test_y, prediction)
print("MAE from Random Forest model, should be better than the other two approaches: " + str(mae))
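The claim that the forest beats a single tree can be sanity-checked on synthetic data (a hedged sketch, not the commit's dataset; on most noisy datasets averaging many trees lowers the validation MAE of an unpruned single tree):

```python
# Compare a single decision tree against a random forest on noisy
# synthetic regression data (stand-in for the Melbourne CSV).
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

X, y = make_regression(n_samples=500, n_features=5, noise=10.0, random_state=0)
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=0)

tree = DecisionTreeRegressor(random_state=0).fit(train_X, train_y)
forest = RandomForestRegressor(random_state=0).fit(train_X, train_y)

tree_mae = mean_absolute_error(test_y, tree.predict(test_X))
forest_mae = mean_absolute_error(test_y, forest.predict(test_X))

# The single tree fits the noise; the forest averages it out
print(f"tree MAE = {tree_mae:.1f}, forest MAE = {forest_mae:.1f}")
```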
5 changes: 5 additions & 0 deletions a7_testing_it_all.py
@@ -0,0 +1,5 @@
import a2_define_model
import a3_validation
import a4_split_test_data
import a5_under_over_fit
import a6_random_forest
Empty file removed random_forest.py
Empty file.
Empty file removed under_over_fit.py
Empty file.
