Commit
Showing 4 changed files with 298 additions and 0 deletions.
@@ -0,0 +1,143 @@
import xgboost as xgb
import time
import numpy as np


class xgb_tuning():
    """Helper for tuning XGBoost hyperparameters with cross-validation.

    Call xgb_train_eval(..., kfold=True) repeatedly (e.g. from a Bayesian
    optimiser) to score hyperparameter sets, then once with kfold=False to
    train a final model with the best boosting-round count seen so far.
    """

    def __init__(self, obj, features, target, nthread=4):
        self.model_name = 'xgboost'
        self.metrics = {'aucpr'}
        self.obj = obj          # custom objective passed to xgb.cv / xgb.train
        self.X = features
        self.y = target
        self.nthread = nthread
        self.num_boost = []     # [n_boosting_rounds, metric] per evaluated config

    def xgb_train_eval(self, max_depth, learning_rate,
                       gamma, reg_alpha, reg_lambda, n_estimators=1000,
                       kfold=True, verbose=-1):

        X = self.X
        y = self.y

        t0 = time.time()
        md = int(max_depth)
        lr = max(learning_rate, 0)
        ne = int(n_estimators)
        gamma = max(gamma, 0)      # default 0
        ra = max(reg_alpha, 0)     # default 0
        rl = max(reg_lambda, 0)    # default 1
        seed = 123

        N_FOLDS = 4
        STOP_ROUNDS = 10

        dtrain = xgb.DMatrix(X, label=y, nthread=self.nthread)

        param_hyp = {'max_depth': md,
                     'eta': lr,        # learning rate
                     'gamma': gamma,
                     # 'num_parallel_tree': ne,
                     'alpha': ra,
                     'lambda': rl
                     }

        if kfold:
            cv_results = xgb.cv(
                params=param_hyp,
                dtrain=dtrain,
                num_boost_round=ne,    # n_estimators
                seed=seed,
                nfold=N_FOLDS,
                metrics=self.metrics,
                early_stopping_rounds=STOP_ROUNDS,
                obj=self.obj
            )

            print('Done in ', round((time.time() - t0) / 60, 2), 'minutes')
            print(len(cv_results['test-aucpr-mean']))

            # AUC-PR is maximised, so the best fold-averaged value is reported directly
            metric_result = cv_results['test-aucpr-mean'].max()
            self.num_boost.append([len(cv_results['test-aucpr-mean']), metric_result])

            return metric_result

        else:
            # Reuse the boosting-round count from the best CV run recorded so far
            self.opt_index = np.argmax(np.array(self.num_boost)[:, 1])
            self.ne_train = int(self.num_boost[self.opt_index][0])

            xgB_model = xgb.train(params=param_hyp,
                                  dtrain=dtrain,
                                  num_boost_round=self.ne_train,  # n_estimators
                                  # seed=seed,
                                  # metrics=self.metrics,
                                  # early_stopping_rounds=STOP_ROUNDS,
                                  obj=self.obj)
            return xgB_model

# class lgb_tuning():
#     def __init__(self):
#         self.num_boost = []

#     def lgb_evaluate(self, max_depth=5
#                      , learning_rate=0.002
#                      , reg_alpha=0.1
#                      , reg_lambda=0.1
#                      , colsample_bytree=0.95
#                      , bagging_fraction=0.95
#                      , num_leaves=10
#                      , min_data=5
#                      , max_bin=50
#                      , bagging_freq=15
#                      , num_boost_round=8500
#                      , X=X_tr2, y=y_tr
#                      , kfold=True, verbose=-1):

#         # ra = max(reg_alpha, 0)  # 0
#         # rl = max(reg_lambda, 0)  # 1

#         # Parameters
#         N_FOLDS = 4
#         MAX_BOOST_ROUNDS = int(num_boost_round)  # --> n_estimators
#         LEARNING_RATE = max(learning_rate, 0)

#         params = {}
#         params['learning_rate'] = LEARNING_RATE  # shrinkage_rate
#         params['boosting_type'] = 'gbdt'
#         params['objective'] = 'binary'
#         params['metric'] = ['auc', 'average_precision']
#         params['scale_pos_weight'] = 8

#         params['sub_feature'] = max(colsample_bytree, 0.3)  # feature_fraction
#         params['reg_alpha'] = reg_alpha
#         params['reg_lambda'] = reg_lambda
#         params['max_depth'] = int(max_depth)
#         params['bagging_fraction'] = bagging_fraction  # sub_row --> same as 'subsample'
#         params['bagging_freq'] = int(bagging_freq)
#         params['num_leaves'] = int(num_leaves)  # num_leaf --> same as 'max_leaves'
#         params['min_data'] = int(min_data)  # the larger, the stronger the regularisation
#         params['max_bin'] = int(max_bin)  # small values help against overfitting
#         params['min_hessian'] = 0.3  # min_sum_hessian_in_leaf

#         params['verbose'] = verbose
#         params['n_jobs'] = 25

#         d_train = lgb.Dataset(X, label=y,
#                               # categorical_feature=col_cat,
#                               free_raw_data=False)

#         if kfold:
#             cv_results = lgb.cv(params, d_train, num_boost_round=MAX_BOOST_ROUNDS, nfold=N_FOLDS,
#                                 verbose_eval=0, early_stopping_rounds=8)
#             metric_result = cv_results['average_precision-mean'][-1]
#             self.num_boost.append([len(cv_results['auc-mean']), metric_result])
#             print(len(cv_results['auc-mean']))
#             return metric_result

#         else:
#             lgB_model = lgb.train(params, d_train, num_boost_round=MAX_BOOST_ROUNDS, verbose_eval=250, early_stopping_rounds=5
#                                   , valid_sets=[d_train])
#             return lgB_model

#     def opt_numb_boost(self):
#         self.opt_index = np.argmax(np.array(self.num_boost)[:, 1])
#         return lgb_tune.num_boost[self.opt_index][0]
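
The tuning class above only scores a hyperparameter set; a minimal usage sketch follows, assuming the bayes_opt package, a hand-written logistic objective, and synthetic data from scikit-learn (none of these are part of the commit, and the bounds and iteration counts are placeholders):

# Illustrative only: logistic_obj and the demo data stand in for the caller's
# own custom objective and training set.
from bayes_opt import BayesianOptimization
from sklearn.datasets import make_classification

def logistic_obj(preds, dtrain):
    # gradient and hessian of the logistic loss on raw margin scores
    labels = dtrain.get_label()
    probs = 1.0 / (1.0 + np.exp(-preds))
    return probs - labels, probs * (1.0 - probs)

X_demo, y_demo = make_classification(n_samples=2000, n_features=20,
                                     weights=[0.9], random_state=0)
tuner = xgb_tuning(obj=logistic_obj, features=X_demo, target=y_demo)

pbounds = {'max_depth': (3, 8), 'learning_rate': (0.01, 0.3),
           'gamma': (0, 5), 'reg_alpha': (0, 5), 'reg_lambda': (0, 5)}
optimizer = BayesianOptimization(f=tuner.xgb_train_eval, pbounds=pbounds,
                                 random_state=123)
optimizer.maximize(init_points=3, n_iter=10)

# Train the final model with the best hyperparameters; the boosting-round count
# comes from the CV history recorded in tuner.num_boost.
final_model = tuner.xgb_train_eval(**optimizer.max['params'], kfold=False)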
@@ -0,0 +1,48 @@
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, average_precision_score
import seaborn as sns


def binary_eval(y_true, y_pred=None, model=None, predictor=None):
    """
    Evaluate a binary classifier: print a classification report, plot a
    normalised confusion matrix, and report AUC-PR and AUC-ROC.

    Args:
        y_true (array-like): ground-truth binary labels
        y_pred (array-like, optional): predicted probabilities/scores, used when no model is given
        model (optional): fitted model whose .predict() returns probabilities/scores
        predictor (optional): feature data passed to model.predict() when a model is given
    Returns:
        auc_pr, auc_roc
    """

    y_val = y_true.copy()

    if model is not None:
        if predictor is not None:
            y_val_pred = model.predict(predictor)
        else:
            print('Insert Data for Model')
            return
    elif y_pred is not None:
        y_val_pred = y_pred.copy()
    else:
        print('Insert Model or Target Prediction')
        return

    y_val_pred2 = y_val_pred.reshape(-1)
    y_val_label = (y_val_pred2 > 0.5).astype(int)   # hard labels at a 0.5 threshold

    print(classification_report(y_val, y_val_label))

    # Confusion matrix normalised per true class
    cm = confusion_matrix(y_val, y_val_label)
    sns.heatmap((cm.transpose() / cm.sum(axis=1)).transpose(), annot=True)

    auc_pr_val = round(average_precision_score(y_val, y_val_pred2), 4)
    auc_roc_val = round(roc_auc_score(y_val, y_val_pred2), 4)

    print('aucpr : ', auc_pr_val)
    print('aucroc : ', auc_roc_val)

    return auc_pr_val, auc_roc_val
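
A short usage sketch for binary_eval, assuming synthetic data and a scikit-learn logistic regression as the scoring model (both illustrative, not from the commit; any model that produces probability scores works the same way):

# Illustrative call: the function expects scores, so pass predict_proba output
# rather than hard class predictions.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=3000, n_features=15, weights=[0.85], random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)

clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
scores = clf.predict_proba(X_te)[:, 1]          # probability of the positive class
auc_pr, auc_roc = binary_eval(y_te, y_pred=scores)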
@@ -0,0 +1,103 @@
import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def eval_features(df_input, col_label='y', col_features_selected=['1', '2']):
    """
    Compute a GINI score and a PSI score for each selected feature.

    Args:
        df_input (pandas.DataFrame): data containing the target and the features
        col_label (string): column name of the target value
        col_features_selected (list of string): names of the features to evaluate
    Returns:
        feat_summary (pandas.DataFrame): one row per feature with columns
        'features', 'gini', 'gini_abs' and 'psi_score'
    """

    # Variable initialisation
    d = dict()  # stores GINI values
    p = dict()  # stores PSI scores

    for c in col_features_selected:
        x = df_input[c]
        y = df_input[col_label]
        idx = ~x.isnull()
        x_ = x[idx]
        y_ = y[idx]

        # Store the GINI score (2*AUC - 1); report the offending column on failure
        try:
            d[c] = 2 * roc_auc_score(y_, x_) - 1
        except ValueError:
            print(c)
            raise

        yval = y.unique()
        df_features1 = df_input[[c]]
        df_1 = df_input[df_input[col_label] == yval[0]][[c]]
        df_2 = df_input[df_input[col_label] == yval[1]][[c]]

        # Binning (equal-frequency deciles over the whole feature) to calculate the PSI
        n = 10
        bins = []
        range_step = 100 / n
        steps = 0
        while steps + range_step < 100:
            try:
                bins.append((np.percentile(df_features1, steps),
                             np.percentile(df_features1, steps + range_step)))
                steps = steps + range_step
            except Exception:
                print(c)
                raise

        df1_probs = []
        df2_probs = []

        # Probability mass of each class in every bin
        for bin0 in bins:
            df1_probs.append(len(df_1[(df_1[c] >= bin0[0]) & (df_1[c] < bin0[1])]) /
                             float(len(df_1)))
            df2_probs.append(len(df_2[(df_2[c] >= bin0[0]) & (df_2[c] < bin0[1])]) /
                             float(len(df_2)))

        # Remaining mass above the last bin edge
        df1_probs.append(len(df_1[(df_1[c] >= bins[-1][1])]) /
                         float(len(df_1)))
        df2_probs.append(len(df_2[(df_2[c] >= bins[-1][1])]) /
                         float(len(df_2)))

        df_prob_dist = pd.DataFrame({'prob1': df1_probs, 'prob2': df2_probs})

        # Calculate PSI values; clip the log ratio where a bin is empty
        # df_prob_dist['karawang'] = df_prob_dist['karawang'].replace(0, 0.0000000001)
        df_prob_dist['psi_calc1'] = (df_prob_dist['prob1'] - df_prob_dist['prob2'])
        df_prob_dist['psi_calc2'] = (np.log(df_prob_dist['prob1'] / df_prob_dist['prob2'])
                                     ).fillna(0).replace(np.inf, 10).replace(-np.inf, -10)
        df_prob_dist['psi'] = df_prob_dist['psi_calc2'] * df_prob_dist['psi_calc1']

        p[c] = df_prob_dist.psi.sum()

    gini_per_features = pd.DataFrame.from_dict(d, orient='index').rename(columns={0: 'gini'})
    gini_per_features['gini_abs'] = np.abs(gini_per_features['gini'])

    psi_per_features = pd.DataFrame.from_dict(p, orient='index').rename(columns={0: 'psi_score'})
    gini_per_features['psi_score'] = psi_per_features['psi_score']
    feat_summary = gini_per_features.reset_index()
    feat_summary.columns = ['features', 'gini', 'gini_abs', 'psi_score']

    return feat_summary


def plot_compare(mydata, col_target, col_x):
    # Overlay the distribution of col_x for the two target classes
    d1 = mydata[mydata[col_target] == 1][col_x]
    d2 = mydata[mydata[col_target] == 0][col_x]
    plt.figure(figsize=(10, 10))
    sns.distplot(d1, label='class 1')
    sns.distplot(d2, label='class 0')
    plt.legend(loc='upper left')
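
A brief usage sketch for eval_features and plot_compare on a made-up DataFrame (the column names and data below are placeholders, not from the commit):

# Illustrative call showing the expected DataFrame layout: one binary target
# column plus the feature columns to be scored.
rng = np.random.default_rng(0)
df_demo = pd.DataFrame({
    'y': rng.integers(0, 2, size=1000),
    'f1': rng.normal(size=1000),
    'f2': rng.normal(size=1000),
})
df_demo['f1'] = df_demo['f1'] + df_demo['y']          # make f1 mildly predictive

summary = eval_features(df_demo, col_label='y', col_features_selected=['f1', 'f2'])
print(summary.sort_values('gini_abs', ascending=False))

plot_compare(df_demo, col_target='y', col_x='f1')      # visual check of class separation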