Commit
[Alam] add function
alamhanz committed Feb 1, 2023
1 parent d4188a0 commit 7bcb972
Showing 4 changed files with 298 additions and 0 deletions.
4 changes: 4 additions & 0 deletions {{cookiecutter.repo_name}}/README.md
@@ -21,6 +21,10 @@ virtualenv {{cookiecutter.venv_name}}
pip install -r requirements.txt
jupyter lab
```
Additional command to register the environment's kernel with Jupyter:
```
ipython kernel install --name [env-name] --user
```
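To verify that the kernel is registered:
```
jupyter kernelspec list
```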

## Structure

143 changes: 143 additions & 0 deletions {{cookiecutter.repo_name}}/src/model_tuner.py
@@ -0,0 +1,143 @@
import time

import numpy as np
import xgboost as xgb

class xgb_tuning():
    def __init__(self, obj, features, target, nthread=4):
        self.model_name = 'xgboost'
        self.metrics = {'aucpr'}
        self.obj = obj
        self.X = features
        self.y = target
        self.nthread = nthread
        self.num_boost = []  # one [boosting_rounds, metric] pair per CV run

    def xgb_train_eval(self, max_depth, learning_rate,
                       gamma, reg_alpha, reg_lambda, n_estimators=1000,
                       kfold=True, verbose=-1):

        X = self.X
        y = self.y

        t0 = time.time()
        # Cast/clip hyperparameters to valid ranges
        md = int(max_depth)
        lr = max(learning_rate, 0)
        ne = int(n_estimators)
        gamma = max(gamma, 0)    # default 0
        ra = max(reg_alpha, 0)   # default 0
        rl = max(reg_lambda, 0)  # default 1
        seed = 123

        N_FOLDS = 4
        STOP_ROUNDS = 10

        dtrain = xgb.DMatrix(X, label=y, nthread=self.nthread)

        param_hyp = {'max_depth': md,
                     'eta': lr,  # learning rate
                     'gamma': gamma,
                     # 'num_parallel_tree': ne,
                     'alpha': ra,
                     'lambda': rl
                     }

        if kfold:
            cv_results = xgb.cv(
                params=param_hyp,
                dtrain=dtrain,
                num_boost_round=ne,  # n_estimators
                seed=seed,
                nfold=N_FOLDS,
                metrics=self.metrics,
                early_stopping_rounds=STOP_ROUNDS,
                obj=self.obj
            )

            print('Done in', round((time.time() - t0) / 60, 2), 'minutes')
            print(len(cv_results['test-aucpr-mean']))

            # aucpr: higher is better, so keep the best (max) CV score
            metric_result = cv_results['test-aucpr-mean'].max()
            self.num_boost.append([len(cv_results['test-aucpr-mean']), metric_result])

            return metric_result

        else:
            # Retrain with the boosting-round count of the best CV run so far
            self.opt_index = np.argmax(np.array(self.num_boost)[:, 1])
            self.ne_train = self.num_boost[self.opt_index][0]

            xgB_model = xgb.train(params=param_hyp,
                                  dtrain=dtrain,
                                  num_boost_round=self.ne_train,  # n_estimators
                                  obj=self.obj)
            return xgB_model
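
A minimal sketch of driving the tuner, assuming a numeric feature matrix `X_train` and binary target `y_train` (both hypothetical); `obj=None` falls back to XGBoost's default objective:

```python
import itertools

# X_train / y_train: hypothetical training data
tuner = xgb_tuning(obj=None, features=X_train, target=y_train)

# Evaluate a small grid with 4-fold CV; each call records
# [boosting rounds, aucpr] in tuner.num_boost.
for md, lr in itertools.product([3, 5], [0.05, 0.1]):
    score = tuner.xgb_train_eval(max_depth=md, learning_rate=lr,
                                 gamma=0.0, reg_alpha=0.0, reg_lambda=1.0)
    print(md, lr, score)

# Retrain on the full data with the round count of the best CV run;
# the winning hyperparameters must be passed again here.
model = tuner.xgb_train_eval(max_depth=5, learning_rate=0.1,
                             gamma=0.0, reg_alpha=0.0, reg_lambda=1.0,
                             kfold=False)
```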

# class lgb_tuning():
# def __init__(self) :
# self.num_boost = []

# def lgb_evaluate(self,max_depth = 5
# ,learning_rate = 0.002
# ,reg_alpha = 0.1
# ,reg_lambda = 0.1
# ,colsample_bytree = 0.95
# ,bagging_fraction = 0.95
# ,num_leaves = 10
# ,min_data = 5
# ,max_bin = 50
# ,bagging_freq = 15
# ,num_boost_round = 8500
# ,X=X_tr2,y=y_tr
# ,kfold = True,verbose = -1):

# # ra= max(reg_alpha, 0) # 0
# # rl= max(reg_lambda, 0) # 1

# # Parameters
# N_FOLDS = 4
# MAX_BOOST_ROUNDS = int(num_boost_round) ## --> n_estimators
# LEARNING_RATE = max(learning_rate,0)

# params = {}
# params['learning_rate'] = LEARNING_RATE # shrinkage_rate
# params['boosting_type'] = 'gbdt'
# params['objective'] = 'binary'
# params['metric'] = ['auc','average_precision']
# params['scale_pos_weight'] = 8

# params['sub_feature'] = max(colsample_bytree,0.3) # feature_fraction
# params['reg_alpha'] = reg_alpha
# params['reg_lambda'] = reg_lambda
# params['max_depth'] = int(max_depth)
# params['bagging_fraction'] = bagging_fraction # sub_row --> same as 'subsample'
# params['bagging_freq'] = int(bagging_freq)
# params['num_leaves'] = int(num_leaves) # num_leaf --> same as 'max_leaves'
# params['min_data'] = int(min_data) # the larger the more regulate
# params['max_bin'] = int(max_bin) ##small number deal with overfit
# params['min_hessian'] = 0.3 # min_sum_hessian_in_leaf

# params['verbose'] = verbose
# params['n_jobs'] = 25

# d_train = lgb.Dataset(X, label=y,
# # categorical_feature=col_cat,
# free_raw_data=False)

# if kfold :
# cv_results = lgb.cv(params, d_train, num_boost_round=MAX_BOOST_ROUNDS, nfold=N_FOLDS,
# verbose_eval=0,early_stopping_rounds=8)
# metric_result = cv_results['average_precision-mean'][-1]
# self.num_boost.append([len(cv_results['auc-mean']),metric_result])
# print(len(cv_results['auc-mean']))
# return metric_result

# else:
# lgB_model = lgb.train(params, d_train, num_boost_round = MAX_BOOST_ROUNDS, verbose_eval = 250,early_stopping_rounds=5
# ,valid_sets=[d_train])
# return lgB_model

# def opt_numb_boost(self):
# self.opt_index = np.argmax(np.array(self.num_boost)[:,1])
#         return self.num_boost[self.opt_index][0]
48 changes: 48 additions & 0 deletions {{cookiecutter.repo_name}}/src/modeleval.py
@@ -0,0 +1,48 @@

import seaborn as sns
from sklearn.metrics import (average_precision_score, classification_report,
                             confusion_matrix, roc_auc_score)

def binary_eval(y_true, y_pred=None, model=None, predictor=None):
    """
    Evaluate a binary model: print a classification report, draw a
    row-normalized confusion matrix, and return AUC-PR and AUC-ROC.
    Args:
        y_true (array-like): ground-truth binary labels
        y_pred (array-like, optional): predicted scores, used when no model is given
        model (optional): fitted model whose predict method returns scores
        predictor (array-like, optional): features passed to model.predict
    Returns:
        auc_pr_val, auc_roc_val
    """

    y_val = y_true.copy()

    if model is not None:
        if predictor is not None:
            y_val_pred = model.predict(predictor)
        else:
            print('Insert data for the model')
            return
    elif y_pred is not None:
        y_val_pred = y_pred.copy()
    else:
        print('Insert a model or a target prediction')
        return

    y_val_pred2 = y_val_pred.reshape(-1)
    y_val_label = (y_val_pred2 > 0.5).astype(int)  # hard labels at a 0.5 threshold

    print(classification_report(y_val, y_val_label))

    # Confusion matrix normalized per true class (each row sums to 1)
    cm = confusion_matrix(y_val, y_val_label)
    sns.heatmap((cm.transpose() / cm.sum(axis=1)).transpose(), annot=True)

    auc_pr_val = round(average_precision_score(y_val, y_val_pred2), 4)
    auc_roc_val = round(roc_auc_score(y_val, y_val_pred2), 4)

    print('aucpr : ', auc_pr_val)
    print('aucroc : ', auc_roc_val)

    return auc_pr_val, auc_roc_val
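
A minimal usage sketch with synthetic data; the split and model names are illustrative:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)

# binary_eval thresholds scores at 0.5, so pass the positive-class probability
scores = clf.predict_proba(X_te)[:, 1]
auc_pr, auc_roc = binary_eval(y_te, y_pred=scores)
```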
103 changes: 103 additions & 0 deletions {{cookiecutter.repo_name}}/src/theexplorer.py
@@ -0,0 +1,103 @@
import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def eval_features(df_input, col_label='y', col_features_selected=['1', '2']):
    """
    Compute a GINI score and a PSI score for each selected feature.
    Args:
        df_input (pandas.DataFrame): data holding the label and the features
        col_label (string): column name of the binary target
        col_features_selected (array of string): names of the features to evaluate
    Returns:
        feat_summary (pandas.DataFrame): features with gini, gini_abs, psi_score
    """

    # Variable initiation
    d = dict()  # stores GINI values
    p = dict()  # stores PSI scores

    for c in col_features_selected:
        x = df_input[c]
        y = df_input[col_label]
        idx = ~x.isnull()
        x_ = x[idx]
        y_ = y[idx]

        # GINI = 2*AUC - 1, computed on the non-null rows
        try:
            d[c] = 2 * roc_auc_score(y_, x_) - 1
        except ValueError:
            print('GINI failed for feature:', c)
            raise

        yval = y.unique()
        df_features1 = df_input[[c]]
        df_1 = df_input[df_input[col_label] == yval[0]][[c]]
        df_2 = df_input[df_input[col_label] == yval[1]][[c]]

        # Equal-width percentile binning to calculate the PSI
        n = 10
        bins = []
        range_step = 100 / n
        steps = 0
        while steps + range_step < 100:
            try:
                bins.append((np.percentile(df_features1, steps),
                             np.percentile(df_features1, steps + range_step)))
            except (ValueError, IndexError):
                print('Binning failed for feature:', c)
                raise
            steps = steps + range_step

        df1_probs = []
        df2_probs = []

        # Probability mass of each class inside every bin
        for bin0 in bins:
            df1_probs.append(len(df_1[(df_1[c] >= bin0[0]) & (df_1[c] < bin0[1])]) /
                             float(len(df_1)))
            df2_probs.append(len(df_2[(df_2[c] >= bin0[0]) & (df_2[c] < bin0[1])]) /
                             float(len(df_2)))

        # Open-ended last bin catches everything above the final edge
        df1_probs.append(len(df_1[(df_1[c] >= bins[-1][1])]) / float(len(df_1)))
        df2_probs.append(len(df_2[(df_2[c] >= bins[-1][1])]) / float(len(df_2)))

        df_prob_dist = pd.DataFrame({'prob1': df1_probs, 'prob2': df2_probs})

        # PSI = sum((p1 - p2) * ln(p1 / p2)), with inf capped at 10
        df_prob_dist['psi_calc1'] = df_prob_dist['prob1'] - df_prob_dist['prob2']
        df_prob_dist['psi_calc2'] = (np.log(df_prob_dist['prob1'] / df_prob_dist['prob2'])
                                     ).fillna(0).replace(np.inf, 10)
        df_prob_dist['psi'] = df_prob_dist['psi_calc2'] * df_prob_dist['psi_calc1']

        p[c] = df_prob_dist.psi.sum()

    gini_per_features = pd.DataFrame.from_dict(d, orient='index').rename(columns={0: 'gini'})
    gini_per_features['gini_abs'] = np.abs(gini_per_features['gini'])

    psi_per_features = pd.DataFrame.from_dict(p, orient='index').rename(columns={0: 'psi_score'})
    gini_per_features['psi_score'] = psi_per_features['psi_score']
    feat_summary = gini_per_features.reset_index()
    feat_summary.columns = ['features', 'gini', 'gini_abs', 'psi_score']

    return feat_summary
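
A small synthetic check of the summary output, assuming only pandas and numpy:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'y': rng.integers(0, 2, 500),
                   'f1': rng.normal(size=500),
                   'f2': rng.normal(size=500)})
df['f1'] = df['f1'] + df['y']  # make f1 informative

summary = eval_features(df, col_label='y', col_features_selected=['f1', 'f2'])
print(summary.sort_values('gini_abs', ascending=False))
```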

def plot_compare(mydata, col_target, col_x):
    """Overlay the distribution of col_x for the two target classes."""
    d1 = mydata[mydata[col_target] == 1][col_x]
    d2 = mydata[mydata[col_target] == 0][col_x]
    plt.figure(figsize=(10, 10))
    # histplot replaces the deprecated sns.distplot
    sns.histplot(d1, kde=True, stat='density', label='class 1')
    sns.histplot(d2, kde=True, stat='density', label='class 0')
    plt.legend(loc='upper left')