Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

explain_weights in Pipelines: minimal version #177

Merged
merged 5 commits into from
May 2, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/autodocs/eli5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ The following functions are exposed to a top level, e.g.
.. autofunction:: eli5.show_weights

.. autofunction:: eli5.show_prediction

.. autofunction:: eli5.transform_feature_names
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not certain this belongs here

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I was also unsure; do you think this function will be used stand-alone, not as an implementation detail of how to make explain.. / show.. functions work?

17 changes: 17 additions & 0 deletions docs/source/libraries/sklearn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,23 @@ is independent.
.. _ExtraTreesClassifier: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
.. _ExtraTreesRegressor: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html#sklearn.ensemble.ExtraTreesRegressor

.. _sklearn-pipelines:

Transformation pipelines
------------------------

:func:`eli5.explain_weights` can be applied to a scikit-learn Pipeline_ as
long as:

* ``explain_weights`` is supported for the final step of the Pipeline
* :func:`eli5.transform_feature_names` is supported for all preceding steps
of the Pipeline. singledispatch_ can be used to register
``transform_feature_names`` for transformer classes not handled (yet) by ELI5
or to override the default implementation.

.. _Pipeline: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
.. _singledispatch: https://pypi.python.org/pypi/singledispatch

Reversing hashing trick
-----------------------

Expand Down
1 change: 1 addition & 0 deletions eli5/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .formatters import format_as_html, format_html_styles, format_as_text
from .explain import explain_weights, explain_prediction
from .sklearn import explain_weights_sklearn, explain_prediction_sklearn
from .transform import transform_feature_names


try:
Expand Down
1 change: 1 addition & 0 deletions eli5/sklearn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@
FeatureUnhasher,
invert_hashing_and_fit,
)
from . import transform as _
16 changes: 16 additions & 0 deletions eli5/sklearn/explain_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np # type: ignore

from sklearn.base import BaseEstimator, RegressorMixin # type: ignore
from sklearn.pipeline import Pipeline # type: ignore
from sklearn.linear_model import ( # type: ignore
ElasticNet, # includes Lasso, MultiTaskElasticNet, etc.
ElasticNetCV,
Expand Down Expand Up @@ -61,6 +62,7 @@
get_default_target_names,
)
from eli5.explain import explain_weights
from eli5.transform import transform_feature_names
from eli5._feature_importances import (
get_feature_importances_filtered,
get_feature_importance_explanation,
Expand Down Expand Up @@ -422,3 +424,17 @@ def _features(target_id):
method='linear model',
is_regression=True,
)


@register(Pipeline)
def explain_weights_pipeline(estimator, feature_names=None, **kwargs):
    """Explain the weights of the last step of a scikit-learn Pipeline.

    The input feature names are passed through ``transform_feature_names``
    for every step except the last one; the last step is then explained
    with ``explain_weights`` using the resulting names.

    Parameters
    ----------
    estimator : Pipeline
        Pipeline whose last step is the estimator to explain.
    feature_names : optional
        Names of the features fed to the first step of the pipeline.
    **kwargs
        Forwarded to ``explain_weights`` for the last step. If a ``vec``
        entry is present it is removed from ``kwargs`` and used, together
        with ``feature_names``, in a ``get_feature_names`` call whose result
        replaces ``feature_names``.

    Returns
    -------
    The explanation produced for the last step, with its ``estimator``
    attribute set to ``repr`` of the whole pipeline.
    """
    steps = estimator.steps
    final_estimator = steps[-1][1]
    preprocessing = Pipeline(steps[:-1])

    # ``vec`` must not reach the final estimator's explain_weights call,
    # so it is always removed from kwargs when given.
    try:
        vec = kwargs.pop('vec')
    except KeyError:
        pass
    else:
        feature_names = get_feature_names(feature_names, vec=vec)

    output_names = transform_feature_names(preprocessing, feature_names)
    explanation = explain_weights(
        final_estimator, feature_names=output_names, **kwargs)
    # Report the whole pipeline rather than only its final step.
    explanation.estimator = repr(estimator)
    return explanation
30 changes: 30 additions & 0 deletions eli5/sklearn/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""transform_feature_names implementations for scikit-learn transformers
"""

import numpy as np # type: ignore
from sklearn.pipeline import Pipeline # type: ignore
from sklearn.feature_selection.base import SelectorMixin # type: ignore

from eli5.transform import transform_feature_names
from eli5.sklearn.utils import get_feature_names as _get_feature_names


# Feature selection:

@transform_feature_names.register(SelectorMixin)
def _select_names(est, in_names=None):
    """Return the feature names kept by the selector ``est``.

    The support mask from ``est.get_support`` decides which entries of the
    input names (resolved through ``_get_feature_names``) are returned.
    """
    support = est.get_support(indices=False)
    all_names = _get_feature_names(
        est, feature_names=in_names, num_features=len(support))
    kept = []
    for idx in np.flatnonzero(support):
        kept.append(all_names[idx])
    return kept


# Pipelines

@transform_feature_names.register(Pipeline)
def _pipeline_names(est, in_names=None):
    """Thread feature names through each step of the pipeline ``est``.

    Steps that are ``None`` are skipped; every other step maps the current
    names to its output names via ``transform_feature_names``.
    """
    current = in_names
    for _step_name, step in est.steps:
        if step is None:
            continue
        current = transform_feature_names(step, current)
    return current
29 changes: 29 additions & 0 deletions eli5/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Handling transformation pipelines in explanations"""

from singledispatch import singledispatch


@singledispatch
def transform_feature_names(transformer, in_names=None):
    """Get feature names for transformer output as a function of input names

    Used by :func:`explain_weights` when applied to a scikit-learn Pipeline,
    this ``singledispatch`` should be registered with custom name
    transformations for each class of transformer.

    This default implementation returns ``transformer.get_feature_names()``
    when the transformer has such a method (``in_names`` is not consulted),
    and raises otherwise.

    Parameters
    ----------
    transformer : scikit-learn-compatible transformer
    in_names : list of str, optional
        Names for features input to transformer.transform().
        If not provided, the implementation may generate default feature names
        if the number of input features is known.

    Returns
    -------
    feature_names : list of str

    Raises
    ------
    NotImplementedError
        If the transformer has no ``get_feature_names`` attribute and no
        implementation is registered for its class.
    """
    if not hasattr(transformer, 'get_feature_names'):
        raise NotImplementedError('transform_feature_names not available for '
                                  '{}'.format(transformer))
    return transformer.get_feature_names()
31 changes: 30 additions & 1 deletion tests/test_sklearn_explain_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@
AdaBoostRegressor,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.base import BaseEstimator
from sklearn.base import BaseEstimator, clone
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.multiclass import OneVsRestClassifier
import pytest

Expand Down Expand Up @@ -484,3 +486,30 @@ def test_feature_importances_no_remaining(clf):
for expl in format_as_all(res, clf):
assert 'more features' not in expl and 'more …' not in expl
assert 'x1' not in expl # it has zero importance


@pytest.mark.parametrize(
    'transformer, X, feature_names, explain_kwargs',
    [
        (None, [[1, 0], [0, 1]], ['hello', 'world'], {}),
        (None, [[1, 0], [0, 1]], None,
         {'vec': CountVectorizer().fit(['hello', 'world'])}),
        (CountVectorizer(), ['hello', 'world'], None, {'top': 1}),
        (CountVectorizer(), ['hello', 'world'], None, {'top': 2}),
        (make_pipeline(CountVectorizer(),
                       SelectKBest(lambda X, y: np.array([3, 2, 1]), k=2)),
         ['hello', 'world zzzignored'], None, {}),
    ])
@pytest.mark.parametrize('predictor', [LogisticRegression(), LinearSVR()])
def test_explain_pipeline(predictor, transformer, X, feature_names,
                          explain_kwargs):
    """Explaining a pipeline must match explaining its bare predictor.

    The reference explanation comes from a clone of ``predictor`` fitted
    directly on the 2x2 identity-like matrix with names 'hello'/'world';
    the pipeline explanation must render to the same HTML.
    """
    y = [1, 0]
    reference = clone(predictor).fit([[1, 0], [0, 1]], y)
    expected = explain_weights(reference,
                               feature_names=['hello', 'world'],
                               **explain_kwargs)
    fitted_pipeline = make_pipeline(transformer, clone(predictor)).fit(X, y)
    actual = explain_weights(fitted_pipeline,
                             feature_names=feature_names,
                             **explain_kwargs)
    expected_html = expected._repr_html_()
    actual_html = actual._repr_html_()
    assert expected_html == actual_html