Commit
Showing 4 changed files with 298 additions and 0 deletions.
@@ -0,0 +1,143 @@
import xgboost as xgb
import time
import numpy as np


class xgb_tuning():
    """Helper for tuning XGBoost hyperparameters with cross-validation.

    Call xgb_train_eval(..., kfold=True) repeatedly (e.g. from a Bayesian
    optimiser) to score hyperparameter sets, then once with kfold=False to
    train a final model with the best boosting-round count seen so far.
    """

    def __init__(self, obj, features, target, nthread=4):
        self.model_name = 'xgboost'
        self.metrics = {'aucpr'}
        self.obj = obj          # custom objective passed to xgb.cv / xgb.train
        self.X = features
        self.y = target
        self.nthread = nthread
        self.num_boost = []     # [n_boosting_rounds, metric] per evaluated config

    def xgb_train_eval(self, max_depth, learning_rate,
                       gamma, reg_alpha, reg_lambda, n_estimators=1000,
                       kfold=True, verbose=-1):

        X = self.X
        y = self.y

        t0 = time.time()
        md = int(max_depth)
        lr = max(learning_rate, 0)
        ne = int(n_estimators)
        gamma = max(gamma, 0)      # default 0
        ra = max(reg_alpha, 0)     # default 0
        rl = max(reg_lambda, 0)    # default 1
        seed = 123

        N_FOLDS = 4
        STOP_ROUNDS = 10

        dtrain = xgb.DMatrix(X, label=y, nthread=self.nthread)

        param_hyp = {'max_depth': md,
                     'eta': lr,        # learning rate
                     'gamma': gamma,
                     # 'num_parallel_tree': ne,
                     'alpha': ra,
                     'lambda': rl
                     }

        if kfold:
            cv_results = xgb.cv(
                params=param_hyp,
                dtrain=dtrain,
                num_boost_round=ne,    # n_estimators
                seed=seed,
                nfold=N_FOLDS,
                metrics=self.metrics,
                early_stopping_rounds=STOP_ROUNDS,
                obj=self.obj
            )

            print('Done in ', round((time.time() - t0) / 60, 2), 'minutes')
            print(len(cv_results['test-aucpr-mean']))

            # AUC-PR is maximised, so the best fold-averaged value is reported directly
            metric_result = cv_results['test-aucpr-mean'].max()
            self.num_boost.append([len(cv_results['test-aucpr-mean']), metric_result])

            return metric_result

        else:
            # Reuse the boosting-round count from the best CV run recorded so far
            self.opt_index = np.argmax(np.array(self.num_boost)[:, 1])
            self.ne_train = int(self.num_boost[self.opt_index][0])

            xgB_model = xgb.train(params=param_hyp,
                                  dtrain=dtrain,
                                  num_boost_round=self.ne_train,  # n_estimators
                                  # seed=seed,
                                  # metrics=self.metrics,
                                  # early_stopping_rounds=STOP_ROUNDS,
                                  obj=self.obj)
            return xgB_model

# class lgb_tuning():
#     def __init__(self):
#         self.num_boost = []

#     def lgb_evaluate(self, max_depth=5
#                      , learning_rate=0.002
#                      , reg_alpha=0.1
#                      , reg_lambda=0.1
#                      , colsample_bytree=0.95
#                      , bagging_fraction=0.95
#                      , num_leaves=10
#                      , min_data=5
#                      , max_bin=50
#                      , bagging_freq=15
#                      , num_boost_round=8500
#                      , X=X_tr2, y=y_tr
#                      , kfold=True, verbose=-1):

#         # ra = max(reg_alpha, 0)  # 0
#         # rl = max(reg_lambda, 0)  # 1

#         # Parameters
#         N_FOLDS = 4
#         MAX_BOOST_ROUNDS = int(num_boost_round)  # --> n_estimators
#         LEARNING_RATE = max(learning_rate, 0)

#         params = {}
#         params['learning_rate'] = LEARNING_RATE  # shrinkage_rate
#         params['boosting_type'] = 'gbdt'
#         params['objective'] = 'binary'
#         params['metric'] = ['auc', 'average_precision']
#         params['scale_pos_weight'] = 8

#         params['sub_feature'] = max(colsample_bytree, 0.3)  # feature_fraction
#         params['reg_alpha'] = reg_alpha
#         params['reg_lambda'] = reg_lambda
#         params['max_depth'] = int(max_depth)
#         params['bagging_fraction'] = bagging_fraction  # sub_row --> same as 'subsample'
#         params['bagging_freq'] = int(bagging_freq)
#         params['num_leaves'] = int(num_leaves)  # num_leaf --> same as 'max_leaves'
#         params['min_data'] = int(min_data)  # the larger, the stronger the regularisation
#         params['max_bin'] = int(max_bin)  # small values help against overfitting
#         params['min_hessian'] = 0.3  # min_sum_hessian_in_leaf

#         params['verbose'] = verbose
#         params['n_jobs'] = 25

#         d_train = lgb.Dataset(X, label=y,
#                               # categorical_feature=col_cat,
#                               free_raw_data=False)

#         if kfold:
#             cv_results = lgb.cv(params, d_train, num_boost_round=MAX_BOOST_ROUNDS, nfold=N_FOLDS,
#                                 verbose_eval=0, early_stopping_rounds=8)
#             metric_result = cv_results['average_precision-mean'][-1]
#             self.num_boost.append([len(cv_results['auc-mean']), metric_result])
#             print(len(cv_results['auc-mean']))
#             return metric_result

#         else:
#             lgB_model = lgb.train(params, d_train, num_boost_round=MAX_BOOST_ROUNDS, verbose_eval=250, early_stopping_rounds=5
#                                   , valid_sets=[d_train])
#             return lgB_model

#     def opt_numb_boost(self):
#         self.opt_index = np.argmax(np.array(self.num_boost)[:, 1])
#         return lgb_tune.num_boost[self.opt_index][0]
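
The tuning class above only scores a hyperparameter set; a minimal usage sketch follows, assuming the bayes_opt package, a hand-written logistic objective, and synthetic data from scikit-learn (none of these are part of the commit, and the bounds and iteration counts are placeholders):

# Illustrative only: logistic_obj and the demo data stand in for the caller's
# own custom objective and training set.
from bayes_opt import BayesianOptimization
from sklearn.datasets import make_classification

def logistic_obj(preds, dtrain):
    # gradient and hessian of the logistic loss on raw margin scores
    labels = dtrain.get_label()
    probs = 1.0 / (1.0 + np.exp(-preds))
    return probs - labels, probs * (1.0 - probs)

X_demo, y_demo = make_classification(n_samples=2000, n_features=20,
                                     weights=[0.9], random_state=0)
tuner = xgb_tuning(obj=logistic_obj, features=X_demo, target=y_demo)

pbounds = {'max_depth': (3, 8), 'learning_rate': (0.01, 0.3),
           'gamma': (0, 5), 'reg_alpha': (0, 5), 'reg_lambda': (0, 5)}
optimizer = BayesianOptimization(f=tuner.xgb_train_eval, pbounds=pbounds,
                                 random_state=123)
optimizer.maximize(init_points=3, n_iter=10)

# Train the final model with the best hyperparameters; the boosting-round count
# comes from the CV history recorded in tuner.num_boost.
final_model = tuner.xgb_train_eval(**optimizer.max['params'], kfold=False)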
@@ -0,0 +1,48 @@
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, average_precision_score
import seaborn as sns


def binary_eval(y_true, y_pred=None, model=None, predictor=None):
    """
    Evaluate a binary classifier: print a classification report, plot a
    normalised confusion matrix, and report AUC-PR and AUC-ROC.

    Args:
        y_true (array-like): ground-truth binary labels
        y_pred (array-like, optional): predicted probabilities/scores, used when no model is given
        model (optional): fitted model whose .predict() returns probabilities/scores
        predictor (optional): feature data passed to model.predict() when a model is given
    Returns:
        auc_pr, auc_roc
    """

    y_val = y_true.copy()

    if model is not None:
        if predictor is not None:
            y_val_pred = model.predict(predictor)
        else:
            print('Insert Data for Model')
            return
    elif y_pred is not None:
        y_val_pred = y_pred.copy()
    else:
        print('Insert Model or Target Prediction')
        return

    y_val_pred2 = y_val_pred.reshape(-1)
    y_val_label = (y_val_pred2 > 0.5).astype(int)   # hard labels at a 0.5 threshold

    print(classification_report(y_val, y_val_label))

    # Confusion matrix normalised per true class
    cm = confusion_matrix(y_val, y_val_label)
    sns.heatmap((cm.transpose() / cm.sum(axis=1)).transpose(), annot=True)

    auc_pr_val = round(average_precision_score(y_val, y_val_pred2), 4)
    auc_roc_val = round(roc_auc_score(y_val, y_val_pred2), 4)

    print('aucpr : ', auc_pr_val)
    print('aucroc : ', auc_roc_val)

    return auc_pr_val, auc_roc_val
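
A short usage sketch for binary_eval, assuming synthetic data and a scikit-learn logistic regression as the scoring model (both illustrative, not from the commit; any model that produces probability scores works the same way):

# Illustrative call: the function expects scores, so pass predict_proba output
# rather than hard class predictions.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=3000, n_features=15, weights=[0.85], random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)

clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
scores = clf.predict_proba(X_te)[:, 1]          # probability of the positive class
auc_pr, auc_roc = binary_eval(y_te, y_pred=scores)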
@@ -0,0 +1,103 @@
import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def eval_features(df_input, col_label='y', col_features_selected=['1', '2']):
    """
    Compute a GINI score and a PSI score for each selected feature.

    Args:
        df_input (pandas.DataFrame): data containing the target and the features
        col_label (string): column name of the target value
        col_features_selected (list of string): names of the features to evaluate
    Returns:
        feat_summary (pandas.DataFrame): one row per feature with columns
        'features', 'gini', 'gini_abs' and 'psi_score'
    """

    # Variable initialisation
    d = dict()  # stores GINI values
    p = dict()  # stores PSI scores

    for c in col_features_selected:
        x = df_input[c]
        y = df_input[col_label]
        idx = ~x.isnull()
        x_ = x[idx]
        y_ = y[idx]

        # Store the GINI score (2*AUC - 1); report the offending column on failure
        try:
            d[c] = 2 * roc_auc_score(y_, x_) - 1
        except ValueError:
            print(c)
            raise

        yval = y.unique()
        df_features1 = df_input[[c]]
        df_1 = df_input[df_input[col_label] == yval[0]][[c]]
        df_2 = df_input[df_input[col_label] == yval[1]][[c]]

        # Binning (equal-frequency deciles over the whole feature) to calculate the PSI
        n = 10
        bins = []
        range_step = 100 / n
        steps = 0
        while steps + range_step < 100:
            try:
                bins.append((np.percentile(df_features1, steps),
                             np.percentile(df_features1, steps + range_step)))
                steps = steps + range_step
            except Exception:
                print(c)
                raise

        df1_probs = []
        df2_probs = []

        # Probability mass of each class in every bin
        for bin0 in bins:
            df1_probs.append(len(df_1[(df_1[c] >= bin0[0]) & (df_1[c] < bin0[1])]) /
                             float(len(df_1)))
            df2_probs.append(len(df_2[(df_2[c] >= bin0[0]) & (df_2[c] < bin0[1])]) /
                             float(len(df_2)))

        # Remaining mass above the last bin edge
        df1_probs.append(len(df_1[(df_1[c] >= bins[-1][1])]) /
                         float(len(df_1)))
        df2_probs.append(len(df_2[(df_2[c] >= bins[-1][1])]) /
                         float(len(df_2)))

        df_prob_dist = pd.DataFrame({'prob1': df1_probs, 'prob2': df2_probs})

        # Calculate PSI values; clip the log ratio where a bin is empty
        # df_prob_dist['karawang'] = df_prob_dist['karawang'].replace(0, 0.0000000001)
        df_prob_dist['psi_calc1'] = (df_prob_dist['prob1'] - df_prob_dist['prob2'])
        df_prob_dist['psi_calc2'] = (np.log(df_prob_dist['prob1'] / df_prob_dist['prob2'])
                                     ).fillna(0).replace(np.inf, 10).replace(-np.inf, -10)
        df_prob_dist['psi'] = df_prob_dist['psi_calc2'] * df_prob_dist['psi_calc1']

        p[c] = df_prob_dist.psi.sum()

    gini_per_features = pd.DataFrame.from_dict(d, orient='index').rename(columns={0: 'gini'})
    gini_per_features['gini_abs'] = np.abs(gini_per_features['gini'])

    psi_per_features = pd.DataFrame.from_dict(p, orient='index').rename(columns={0: 'psi_score'})
    gini_per_features['psi_score'] = psi_per_features['psi_score']
    feat_summary = gini_per_features.reset_index()
    feat_summary.columns = ['features', 'gini', 'gini_abs', 'psi_score']

    return feat_summary


def plot_compare(mydata, col_target, col_x):
    # Overlay the distribution of col_x for the two target classes
    d1 = mydata[mydata[col_target] == 1][col_x]
    d2 = mydata[mydata[col_target] == 0][col_x]
    plt.figure(figsize=(10, 10))
    sns.distplot(d1, label='class 1')
    sns.distplot(d2, label='class 0')
    plt.legend(loc='upper left')
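
A brief usage sketch for eval_features and plot_compare on a made-up DataFrame (the column names and data below are placeholders, not from the commit):

# Illustrative call showing the expected DataFrame layout: one binary target
# column plus the feature columns to be scored.
rng = np.random.default_rng(0)
df_demo = pd.DataFrame({
    'y': rng.integers(0, 2, size=1000),
    'f1': rng.normal(size=1000),
    'f2': rng.normal(size=1000),
})
df_demo['f1'] = df_demo['f1'] + df_demo['y']          # make f1 mildly predictive

summary = eval_features(df_demo, col_label='y', col_features_selected=['f1', 'f2'])
print(summary.sort_values('gini_abs', ascending=False))

plot_compare(df_demo, col_target='y', col_x='f1')      # visual check of class separation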