Commit
Merge pull request #101 from albalu/master
add TreeBasedFeatureReduction + tests
albalu authored Oct 20, 2018
2 parents 5acc295 + 1ee3a12 commit e5142f4
Showing 3 changed files with 359 additions and 4 deletions.
136 changes: 136 additions & 0 deletions matbench/preprocessing/feature_selection.py
@@ -0,0 +1,136 @@
from matbench.utils.utils import setup_custom_logger, MatbenchError
from sklearn.base import is_classifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, \
    GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import check_cv

class TreeBasedFeatureReduction(object):
"""
Tree-based feature reduction tools based on sklearn models that have
the .feature_importances_ attribute.
Args:
mode (str): "regression" or "classification"
importance_percentile (float): the selected percentile of the features
sorted (descending) based on their importance.
random_state (int): relevant if non-deterministic algorithms such as
random forest are used.
"""
def __init__(self, mode, importance_percentile=0.95, loglevel=None,
logpath='.', random_state=0):
self.mode = mode
self.logger = setup_custom_logger(filepath=logpath, level=loglevel)
self.importance_percentile = importance_percentile
self.selected_features = None
self.rs = random_state

    def get_top_features(self, feat_importance):
        """
        Goes through a sorted list of features and selects those at the top
        until their cumulative importance reaches the given percentile.

        Args:
            feat_importance ([(str, float)]): a sorted (descending) list of
                (feature, importance) tuples

        Returns ([str]): the list of top features whose cumulative importance
            reaches the importance_percentile threshold.
        """
        selected_feats = []
        frac = 0.0
        # keep adding features until cumulative importance hits the percentile
        for feat in feat_importance:
            selected_feats.append(feat[0])
            frac += feat[1]
            if frac >= self.importance_percentile:
                break
        return selected_feats

    def get_reduced_features(self, tree_model, X, y, recursive=True):
        """
        Gives a reduced list of feature names given a tree-based model that
        has the .feature_importances_ attribute.

        Args:
            tree_model (instantiated sklearn tree-based model): the model used
                to rank the features by importance
            X (pandas.DataFrame): input data with feature names as columns
            y (pandas.Series or numpy.ndarray): the target column
            recursive (bool): whether to refit the model on the reduced
                feature set and reduce again until no more features are
                dropped

        Returns ([str]): the list of top features whose cumulative importance
            reaches the importance_percentile threshold.
        """
        m_curr = 0  # current number of top/important features
        m_prev = len(X.columns)
        while m_curr < m_prev:  # repeat until the feature set stops shrinking
            tree_model.fit(X, y)
            fimportance = sorted(zip(X.columns, tree_model.feature_importances_),
                                 key=lambda x: x[1], reverse=True)
            tfeats = self.get_top_features(fimportance)
            m_curr = len(tfeats)
            m_prev = len(X.columns)
            self.logger.debug(
                'nfeatures: {}->{}'.format(len(X.columns), m_curr))
            X = X[tfeats]
            if not recursive:
                break
        return tfeats

    def fit(self, X, y, tree='rf', recursive=True, cv=5):
        """
        Fits to the data (X) and target (y) to determine the selected_features.

        Args:
            X (pandas.DataFrame): input data, note that numpy matrix is NOT
                accepted since the X.columns is used for feature names
            y (pandas.Series or np.ndarray): list of outputs used for fitting
                the tree model
            tree (str or instantiated sklearn tree-based model): if a model is
                directly fed, it must have the .feature_importances_ attribute
            recursive (bool): whether to recursively reduce the features (True)
                or just do it once (False)
            cv (int or sklearn cross-validation generator): the
                cross-validation scheme; passed through sklearn's check_cv

        Returns (None):
            sets the class attribute .selected_features
        """
        m0 = len(X.columns)
        if isinstance(tree, str):
            if tree.lower() in ['rf', 'random forest', 'randomforest']:
                if self.mode.lower() in ['classification', 'classifier']:
                    tree = RandomForestClassifier(random_state=self.rs)
                else:
                    tree = RandomForestRegressor(random_state=self.rs)
            elif tree.lower() in ['gb', 'gbt', 'gradient boosting']:
                if self.mode.lower() in ['classification', 'classifier']:
                    tree = GradientBoostingClassifier(random_state=self.rs)
                else:
                    tree = GradientBoostingRegressor(random_state=self.rs)
            else:
                raise MatbenchError('Unsupported tree type {}!'.format(tree))

        cv = check_cv(cv=cv, y=y, classifier=is_classifier(tree))
        all_feats = []
        for train, test in cv.split(X, y, groups=None):
            Xtrn = X.iloc[train]
            ytrn = y[train]
            all_feats += self.get_reduced_features(tree, Xtrn, ytrn, recursive)
        # take the union of selected features of each fold
        self.selected_features = list(set(all_feats))
        self.logger.info('Finished tree-based feature reduction of {} initial '
                         'features to {}'.format(m0, len(self.selected_features)))

    def transform(self, X, y=None):
        """
        Transforms the data with the subset of features determined after
        calling the fit method on the data.

        Args:
            X (pandas.DataFrame): input data, note that numpy matrix is NOT
                accepted since the X.columns is used for feature names
            y (placeholder): ignored input (for consistency in notation)

        Returns (pandas.DataFrame): the data with reduced number of features.
        """
        if self.selected_features is None:
            raise MatbenchError('The fit method should be called first!')
        return X[self.selected_features]
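
To make the selection rule concrete, here is a hypothetical, hand-checked illustration of get_top_features; the feature names and importance values are invented for this example and are not part of the commit. With importance_percentile=0.95, the cumulative importance crosses the threshold at the third feature, so the fourth is dropped.

from matbench.preprocessing.feature_selection import TreeBasedFeatureReduction

tbfr = TreeBasedFeatureReduction(mode='regression', importance_percentile=0.95)
# cumulative importances: 0.5 -> 0.8 -> 0.97 (>= 0.95, so stop at 'c')
print(tbfr.get_top_features([('a', 0.5), ('b', 0.3), ('c', 0.17), ('d', 0.03)]))
# ['a', 'b', 'c']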
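A minimal end-to-end sketch of the fit/transform workflow is below; the synthetic DataFrame, column names, and importance_percentile value are illustrative assumptions, not from the commit.

import numpy as np
import pandas as pd

from matbench.preprocessing.feature_selection import TreeBasedFeatureReduction

rng = np.random.RandomState(0)
# 10 named feature columns; only f0 and f1 carry signal
X = pd.DataFrame(rng.rand(100, 10),
                 columns=['f{}'.format(i) for i in range(10)])
y = 3 * X['f0'] + 2 * X['f1'] + 0.1 * rng.rand(100)

tbfr = TreeBasedFeatureReduction(mode='regression', importance_percentile=0.9)
tbfr.fit(X, y, tree='rf', recursive=True, cv=5)  # sets .selected_features
X_reduced = tbfr.transform(X)  # DataFrame limited to the kept columns
print(tbfr.selected_features)

Because fit takes the union of the features selected on each CV fold, the final set can be larger than any single fold's selection; transform then simply restricts the DataFrame to those columns.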