-
Notifications
You must be signed in to change notification settings - Fork 1
/
mpkpts_feature_selection.py
130 lines (109 loc) · 4.12 KB
/
mpkpts_feature_selection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from src import constants
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import StratifiedKFold
import pickle
import os
import argparse
# Upper bound on the number of features kept by the random-forest pre-filter;
# the forward sequential selection grows its subset up to the same size.
MAX_SELECTED_FEATURES = 30


def _run_sequential_selection(estimator, X, y, *, forward, k_features,
                              n_jobs, output_path):
    """Fit a (non-floating) sequential feature selector and pickle it.

    Args:
        estimator: Classifier scored at every selection step.
        X: DataFrame restricted to the candidate feature columns.
        y: Target labels for the rows of ``X``.
        forward: ``True`` for forward selection, ``False`` for backward.
        k_features: Subset size at which the search stops.
        n_jobs: Number of parallel jobs handed to the selector.
        output_path: File the fitted selector is pickled to.

    Returns:
        The fitted ``SequentialFeatureSelector``.
    """
    selector = SequentialFeatureSelector(
        estimator,
        k_features=k_features,
        forward=forward,
        floating=False,
        scoring='roc_auc',
        verbose=1,
        # A fresh, identically seeded splitter per run keeps the folds the
        # same for the backward and the forward search.
        cv=StratifiedKFold(
            n_splits=5, shuffle=True, random_state=constants.SEED
        ),
        n_jobs=n_jobs,
    ).fit(X, y)

    with open(str(output_path), 'wb') as handle:
        pickle.dump(selector, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return selector


if __name__ == '__main__':
    # Read arguments with argparse: n_jobs and the data folder
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_jobs', type=int, default=1,
                        help='Number of jobs for parallelization.')
    parser.add_argument('--folder', type=str, required=True,
                        help='Folder containing the extracted features.')
    args = parser.parse_args()

    # Define seed for reproducibility
    os.environ['PYTHONHASHSEED'] = str(constants.SEED)
    np.random.seed(constants.SEED)

    path_data = Path(args.folder)

    # Read data.
    # NOTE(review): unpickling executes arbitrary code -- only point --folder
    # at data produced by a trusted pipeline.
    X = pd.read_pickle(path_data / 'X.pkl')
    y = pd.read_pickle(path_data / 'y.pkl')

    # Not used below; presumably the meaning of the two labels found in y.
    class_labels = {
        0: 'SFH_No',
        1: 'SFH_Yes',
    }

    # TODO decide to keep this or not
    # NOTE(review): DataFrame/Series.append was removed in pandas 2.0; use
    # pd.concat if this is ever re-enabled.
    # Subset X and y to have balanced classes
    # minority_class = y.value_counts().min()
    # minority_class_value = y.value_counts().idxmin()
    # X = X[y == minority_class_value].append(X[y == (1-minority_class_value)].sample(n=minority_class, random_state=constants.SEED))
    # y = y[y == minority_class_value].append(y[y == (1-minority_class_value)].sample(n=minority_class, random_state=constants.SEED))

    # Compute 'balanced' class weights and map each class to its weight
    classes = np.unique(y)
    class_weights = dict(zip(
        classes,
        class_weight.compute_class_weight(
            class_weight='balanced',
            classes=classes,
            y=y,
        ),
    ))

    # Normalize data. Keep the original row index so that X stays aligned
    # with y for any label-based operation (e.g. boolean masks built from y).
    scaler = StandardScaler()
    X = pd.DataFrame(
        scaler.fit_transform(X), columns=X.columns, index=X.index
    )

    # Instantiate feature selector. max_features may not exceed the number of
    # available columns, so cap it for narrow feature tables.
    rf_selector = SelectFromModel(
        RandomForestClassifier(
            criterion='entropy',
            max_depth=10,
            n_estimators=250,
            random_state=constants.SEED,
            class_weight=class_weights
        ),
        max_features=min(MAX_SELECTED_FEATURES, X.shape[1])
    )

    # Fit feature selector
    print('Fitting RF selector...')
    rf_selector.fit(X, y)

    # Get selected features
    selected_feature_indices_rf = rf_selector.get_support(indices=True)
    selected_feature_names_rf = [
        X.columns[index] for index in selected_feature_indices_rf
    ]
    # Print selected features
    # print(f'Selected {len(selected_feature_names_rf)} features:')
    # print(selected_feature_names_rf)
    selected_features = selected_feature_names_rf
    if not selected_features:
        raise RuntimeError(
            'The RF selector kept no features; nothing to run sequential '
            'selection on.'
        )

    # Create a classifier with somehow good parameters
    # to perform stepwise feature selection
    classifier = RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=constants.SEED,
    )

    # Backward selection: start from all pre-selected features, go down to 1
    print('Performing backward selection...')
    sfs_backward = _run_sequential_selection(
        classifier,
        X[selected_features],
        y,
        forward=False,
        k_features=1,
        n_jobs=args.n_jobs,
        output_path=path_data / 'sfs_backward.pkl',
    )

    # Forward selection: max_features is only an upper bound for the RF
    # pre-filter (its importance threshold can keep fewer columns), and the
    # sequential selector rejects k_features larger than the number of
    # columns it is given -- so cap the target size accordingly.
    print('Performing forward selection...')
    sfs_forward = _run_sequential_selection(
        classifier,
        X[selected_features],
        y,
        forward=True,
        k_features=min(MAX_SELECTED_FEATURES, len(selected_features)),
        n_jobs=args.n_jobs,
        output_path=path_data / 'sfs_forward.pkl',
    )

    print('Done.')