Merge pull request #101 from albalu/master
add TreeBasedFeatureReduction + tests
Showing 3 changed files with 359 additions and 4 deletions.
@@ -0,0 +1,136 @@
from matbench.utils.utils import setup_custom_logger, MatbenchError
from sklearn.base import is_classifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, \
    GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import check_cv


class TreeBasedFeatureReduction(object):
""" | ||
Tree-based feature reduction tools based on sklearn models that have | ||
the .feature_importances_ attribute. | ||
Args: | ||
mode (str): "regression" or "classification" | ||
importance_percentile (float): the selected percentile of the features | ||
sorted (descending) based on their importance. | ||
random_state (int): relevant if non-deterministic algorithms such as | ||
random forest are used. | ||
""" | ||
def __init__(self, mode, importance_percentile=0.95, loglevel=None, | ||
logpath='.', random_state=0): | ||
self.mode = mode | ||
self.logger = setup_custom_logger(filepath=logpath, level=loglevel) | ||
self.importance_percentile = importance_percentile | ||
self.selected_features = None | ||
self.rs = random_state | ||

    def get_top_features(self, feat_importance):
        """
        Goes through a sorted list of features and selects the top
        percentile.

        Args:
            feat_importance ([(str, float)]): a sorted list of
                (feature, importance) tuples

        Returns ([str]): the top features that cumulatively account for
            the importance_percentile of the total feature importance.
        """
        selected_feats = []
        frac = 0.0
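        # illustrative example (not part of this diff): with
        # importance_percentile=0.95 and feat_importance
        # [('a', 0.5), ('b', 0.3), ('c', 0.1), ('d', 0.05), ('e', 0.05)],
        # the cumulative importance reaches 0.95 at 'd', so
        # ['a', 'b', 'c', 'd'] is returned and 'e' is dropped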
        for feat in feat_importance:
            selected_feats.append(feat[0])
            frac += feat[1]
            if frac >= self.importance_percentile:
                break
        return selected_feats

    def get_reduced_features(self, tree_model, X, y, recursive=True):
        """
        Gives a reduced list of feature names given a tree-based model that
        has the .feature_importances_ attribute.

        Args:
            tree_model (instantiated sklearn tree-based model): the model
                used to rank the features by importance
            X (pandas.DataFrame): the feature matrix; column names are used
                as feature names
            y (pandas.Series or numpy.ndarray): the target column
            recursive (bool): whether to repeat the reduction until the
                feature set stops shrinking (True) or do a single pass (False)

        Returns ([str]): the top features that cumulatively account for
            the importance_percentile of the total feature importance.
        """
        m_curr = 0  # current number of top/important features
        m_prev = len(X.columns)
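        # refit on the surviving features each iteration; the loop exits as
        # soon as a fit no longer shrinks the selection (or after one pass
        # when recursive=False)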
        while m_curr < m_prev:
            tree_model.fit(X, y)
            fimportance = sorted(
                zip(X.columns, tree_model.feature_importances_),
                key=lambda x: x[1], reverse=True)
            tfeats = self.get_top_features(fimportance)
            m_curr = len(tfeats)
            m_prev = len(X.columns)
            self.logger.debug(
                'nfeatures: {}->{}'.format(len(X.columns), m_curr))
            X = X[tfeats]
            if not recursive:
                break
        return tfeats

    def fit(self, X, y, tree='rf', recursive=True, cv=5):
        """
        Fits to the data (X) and target (y) to determine the selected_features.

        Args:
            X (pandas.DataFrame): input data; note that a numpy matrix is NOT
                accepted since X.columns is used for feature names
            y (pandas.Series or np.ndarray): list of outputs used for fitting
                the tree model
            tree (str or instantiated sklearn tree-based model): if a model
                is directly fed, it must have the .feature_importances_
                attribute
            recursive (bool): whether to recursively reduce the features
                (True) or just do it once (False)
            cv (int or sklearn cross-validation generator): the features are
                selected on each training fold and the union over all folds
                is kept

        Returns (None):
            sets the class attribute .selected_features
        """
        m0 = len(X.columns)
        if isinstance(tree, str):
            if tree.lower() in ['rf', 'random forest', 'randomforest']:
                if self.mode.lower() in ['classification', 'classifier']:
                    tree = RandomForestClassifier(random_state=self.rs)
                else:
                    tree = RandomForestRegressor(random_state=self.rs)
            elif tree.lower() in ['gb', 'gbt', 'gradient boosting']:
                if self.mode.lower() in ['classification', 'classifier']:
                    tree = GradientBoostingClassifier(random_state=self.rs)
                else:
                    tree = GradientBoostingRegressor(random_state=self.rs)
            else:
                raise MatbenchError('Unsupported tree_type {}!'.format(tree))

        cv = check_cv(cv=cv, y=y, classifier=is_classifier(tree))
        all_feats = []
        for train, test in cv.split(X, y, groups=None):
            Xtrn = X.iloc[train]
            # positional indexing so pandas.Series targets behave the same
            # as numpy arrays regardless of their index
            ytrn = y.iloc[train] if hasattr(y, 'iloc') else y[train]
            all_feats += self.get_reduced_features(tree, Xtrn, ytrn, recursive)
        # take the union of selected features of each fold
        self.selected_features = list(set(all_feats))
        self.logger.info('Finished tree-based feature reduction of {} initial '
                         'features to {}'.format(m0, len(self.selected_features)))

    def transform(self, X, y=None):
        """
        Transforms the data to the subset of features determined when the
        fit method was called on the data.

        Args:
            X (pandas.DataFrame): input data; note that a numpy matrix is NOT
                accepted since X.columns is used for feature names
            y (placeholder): ignored input (kept for API consistency)

        Returns (pandas.DataFrame): the data with a reduced number of
            features.
        """
        if self.selected_features is None:
            raise MatbenchError('The fit method should be called first!')
        return X[self.selected_features]
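A minimal usage sketch of the new class (illustrative, not part of this commit; the synthetic data and the commented import path are assumptions, since the diff does not show the file's module path):

import pandas as pd
from sklearn.datasets import make_regression

# toy regression problem with only a few informative features
X_arr, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           random_state=0)
X = pd.DataFrame(X_arr, columns=['f{}'.format(i) for i in range(20)])

# assumed import path; adjust to wherever this file lives in matbench
# from matbench.preprocess import TreeBasedFeatureReduction

tbfr = TreeBasedFeatureReduction(mode='regression', importance_percentile=0.9)
tbfr.fit(X, y, tree='rf', recursive=True, cv=5)  # sets .selected_features
X_reduced = tbfr.transform(X)  # DataFrame with only the selected columns
print(len(X_reduced.columns), 'features kept out of', len(X.columns))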