Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support hashing vectorizer inside a union #176

Merged
merged 13 commits into from
May 2, 2017
11 changes: 11 additions & 0 deletions docs/source/libraries/sklearn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,17 @@ automatically; to handle HashingVectorizer_ and FeatureHasher_ for
# and ``ivec`` can be used as a vectorizer for eli5.explain_weights:
eli5.explain_weights(clf, vec=ivec)

HashingVectorizer_ is also supported inside a FeatureUnion_:
:func:`eli5.explain_prediction` handles this case automatically, and for
:func:`eli5.explain_weights` you can use :func:`eli5.sklearn.invert_hashing_and_fit`
(it works for a plain HashingVectorizer_ too) — it takes the FeatureUnion_ apart,
inverts and fits all hashing vectorizers, and returns a new FeatureUnion_::

from eli5.sklearn import invert_hashing_and_fit

ivec = invert_hashing_and_fit(vec, X_sample)
eli5.explain_weights(clf, vec=ivec)

.. _FeatureHasher: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html#sklearn.feature_extraction.FeatureHasher

.. _sklearn-text-highlighting:
Expand Down
6 changes: 5 additions & 1 deletion eli5/sklearn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,8 @@
explain_prediction_linear_classifier,
explain_prediction_linear_regressor,
)
from .unhashing import InvertableHashingVectorizer, FeatureUnhasher
from .unhashing import (
InvertableHashingVectorizer,
FeatureUnhasher,
invert_hashing_and_fit,
)
95 changes: 89 additions & 6 deletions eli5/sklearn/unhashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
from __future__ import absolute_import
from collections import defaultdict, Counter
from itertools import chain
from typing import List, Iterable, Any, Dict
from typing import List, Iterable, Any, Dict, Tuple, Union

import numpy as np # type: ignore
import six
from sklearn.base import BaseEstimator, TransformerMixin # type: ignore
from sklearn.feature_extraction.text import ( # type: ignore
HashingVectorizer,
FeatureHasher,
)
from sklearn.pipeline import FeatureUnion # type: ignore

from eli5._feature_names import FeatureNames

Expand Down Expand Up @@ -161,7 +163,11 @@ def recalculate_attributes(self, force=False):
"""
if not self._attributes_dirty and not force:
return
terms = np.array([term for term, _ in self._term_counts.most_common()])
terms = [term for term, _ in self._term_counts.most_common()]
if six.PY2:
terms = np.array(terms, dtype=np.object)
else:
terms = np.array(terms)
if len(terms):
indices, signs = _get_indices_and_signs(self.hasher, terms)
else:
Expand Down Expand Up @@ -225,14 +231,91 @@ def _invert_signs(signs):
return signs[0] < 0


def handle_hashing_vec(vec, feature_names, coef_scale):
def is_invhashing(vec):
    """ Return True if ``vec`` is an :class:`InvertableHashingVectorizer`. """
    return isinstance(vec, InvertableHashingVectorizer)


def handle_hashing_vec(vec, feature_names, coef_scale, with_coef_scale=True):
    """ Return feature_names and coef_scale (if with_coef_scale is True),
    calling .get_feature_names for invhashing vectorizers.
    Arguments that are already set (non-None) are passed through unchanged.
    """
    want_scale = with_coef_scale and coef_scale is None
    want_names = feature_names is None
    if is_invhashing(vec):
        # A single invertable hashing vectorizer: query it directly.
        if want_names:
            feature_names = vec.get_feature_names(always_signed=False)
        if want_scale:
            coef_scale = vec.column_signs_
    elif (isinstance(vec, FeatureUnion) and
            (want_scale or want_names) and
            any(is_invhashing(v) for _, v in vec.transformer_list)):
        # A union containing invhashing vectorizers: compute both values
        # in one pass, then keep only what the caller still needs.
        union_names, union_scale = _invhashing_union_feature_names_scale(vec)
        if want_names:
            feature_names = union_names
        if want_scale:
            coef_scale = union_scale
    if not with_coef_scale:
        return feature_names
    return feature_names, coef_scale


def _invhashing_union_feature_names_scale(vec_union):
    # type: (FeatureUnion) -> Tuple[FeatureNames, np.ndarray]
    """ Build combined feature names and a coef_scale array for a
    FeatureUnion that contains InvertableHashingVectorizer transformers.

    Feature names from each transformer are prefixed with the transformer's
    name in the union ("<vec_name>__<feature>"), and their indices are
    shifted so that they match the column layout FeatureUnion produces
    (transformers concatenated in transformer_list order).

    Returns a (FeatureNames, coef_scale) pair; coef_scale holds the
    column signs for hashing-vectorizer columns and NaN for all other
    columns (no scaling information available for them).
    """
    feature_names_store = {}  # type: Dict[int, Union[str, List]]
    unkn_template = None
    shift = 0  # running column offset of the current transformer in the union
    coef_scale_values = []
    for vec_name, vec in vec_union.transformer_list:
        if isinstance(vec, InvertableHashingVectorizer):
            vec_feature_names = vec.get_feature_names(always_signed=False)
            # NOTE(review): if several invhashing vectorizers are present,
            # only the last one's unkn_template is kept — presumably they
            # all share the same template; confirm.
            unkn_template = vec_feature_names.unkn_template
            for idx, fs in vec_feature_names.feature_names.items():
                # Each entry is a list of {'name': ..., ...} dicts (hash
                # collisions map several terms to one column); copy each
                # dict and prefix its name with the transformer name.
                new_fs = []
                for f in fs:
                    new_f = dict(f)
                    new_f['name'] = '{}__{}'.format(vec_name, f['name'])
                    new_fs.append(new_f)
                feature_names_store[idx + shift] = new_fs
            # Remember where this vectorizer's column signs start.
            coef_scale_values.append((shift, vec.column_signs_))
            shift += vec_feature_names.n_features
        else:
            # Non-hashing vectorizer: plain string names, same prefixing.
            vec_feature_names = vec.get_feature_names()
            feature_names_store.update(
                (shift + idx, '{}__{}'.format(vec_name, fname))
                for idx, fname in enumerate(vec_feature_names))
            shift += len(vec_feature_names)
    n_features = shift
    feature_names = FeatureNames(
        feature_names=feature_names_store,
        n_features=n_features,
        unkn_template=unkn_template)
    # Start with all-NaN, then fill in the signs for hashing columns.
    coef_scale = np.ones(n_features) * np.nan
    for idx, values in coef_scale_values:
        coef_scale[idx: idx + len(values)] = values
    return feature_names, coef_scale


def is_invhashing(vec):
return isinstance(vec, InvertableHashingVectorizer)
def invert_hashing_and_fit(vec, docs):
    # type: (Union[FeatureUnion, HashingVectorizer], Any) -> Union[FeatureUnion, InvertableHashingVectorizer]
    """ Create an InvertableHashingVectorizer from hashing vectorizer ``vec``
    and fit it on ``docs``. If ``vec`` is a FeatureUnion, do it for all
    hashing vectorizers in the union.
    Returns an InvertableHashingVectorizer, or a FeatureUnion,
    or an unchanged vectorizer.
    """
    if isinstance(vec, HashingVectorizer):
        inverted = InvertableHashingVectorizer(vec)
        inverted.fit(docs)
        return inverted
    if isinstance(vec, FeatureUnion):
        has_hashing = any(isinstance(v, HashingVectorizer)
                          for _, v in vec.transformer_list)
        if has_hashing:
            return _fit_invhashing_union(vec, docs)
    # Nothing to invert: return the vectorizer as-is.
    return vec


def _fit_invhashing_union(vec_union, docs):
    # type: (FeatureUnion, Any) -> FeatureUnion
    """ Fit an InvertableHashingVectorizer on ``docs`` for each hashing
    vectorizer inside a FeatureUnion; return a new FeatureUnion with the
    same weights and n_jobs but inverted (fitted) vectorizers.
    """
    transformers = []
    for name, v in vec_union.transformer_list:
        transformers.append((name, invert_hashing_and_fit(v, docs)))
    return FeatureUnion(transformers,
                        transformer_weights=vec_union.transformer_weights,
                        n_jobs=vec_union.n_jobs)
16 changes: 7 additions & 9 deletions eli5/sklearn/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
import numpy as np # type: ignore
import scipy.sparse as sp # type: ignore
from sklearn.multiclass import OneVsRestClassifier # type: ignore
from sklearn.feature_extraction.text import HashingVectorizer # type: ignore

from eli5.sklearn.unhashing import InvertableHashingVectorizer, is_invhashing
from eli5.sklearn.unhashing import invert_hashing_and_fit, handle_hashing_vec
from eli5._feature_names import FeatureNames


Expand Down Expand Up @@ -204,13 +203,12 @@ def get_X(doc, vec=None, vectorized=False, to_dense=False):


def handle_vec(clf, doc, vec, vectorized, feature_names, num_features=None):
    """ Prepare the vectorizer and feature names for explaining a
    prediction on ``doc``: invert and fit hashing vectorizers when the
    document is not already vectorized, then resolve feature names from
    the (possibly inverted) vectorizer and the classifier.
    Returns a (vec, feature_names) pair.
    """
    if not vectorized:
        # Turn hashing vectorizers (plain or inside a FeatureUnion)
        # into fitted invertable ones.
        vec = invert_hashing_and_fit(vec, [doc])
    # Explaining predictions does not need coef_scale:
    # it is handled by the vectorizer itself.
    names = handle_hashing_vec(
        vec, feature_names, coef_scale=None, with_coef_scale=False)
    names = get_feature_names(
        clf, vec, feature_names=names, num_features=num_features)
    return vec, names
Expand Down
51 changes: 33 additions & 18 deletions tests/test_sklearn_vectorizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@
import attr
import pytest

from eli5.sklearn import InvertableHashingVectorizer
from sklearn.base import BaseEstimator
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion

from eli5 import explain_prediction
from eli5 import explain_prediction, explain_weights
from eli5.formatters import format_as_html
from eli5.sklearn import invert_hashing_and_fit, InvertableHashingVectorizer
from .utils import format_as_all, get_all_features, get_names_coefs, write_html


Expand Down Expand Up @@ -165,7 +165,7 @@ def test_explain_regression_hashing_vectorizer(newsgroups_train_binary):

@pytest.mark.parametrize(['vec_cls'], [
[CountVectorizer],
#[HashingVectorizer],
[HashingVectorizer],
])
def test_explain_feature_union(vec_cls):
data = [
Expand All @@ -181,18 +181,25 @@ def test_explain_feature_union(vec_cls):
'text': 'security'},
]
ys = [1, 0, 0, 0, 1]
vec = FeatureUnion([
('url', vec_cls(preprocessor=lambda x: x['url'],
analyzer='char',
ngram_range=(3, 3))),
('text', vec_cls(preprocessor=lambda x: x['text'])),
])
url_vec = vec_cls(
preprocessor=lambda x: x['url'], analyzer='char', ngram_range=(3, 3))
text_vec = vec_cls(preprocessor=lambda x: x['text'])
vec = FeatureUnion([('url', url_vec), ('text', text_vec)])
xs = vec.fit_transform(data)
clf = LogisticRegression(random_state=42)
clf.fit(xs, ys)
res = explain_prediction(clf, data[0], vec)
html_expl = format_as_html(res, force_weights=False)
write_html(clf, html_expl, '')

ivec = invert_hashing_and_fit(vec, data)
weights_res = explain_weights(clf, ivec)
html_expl = format_as_html(weights_res)
write_html(clf, html_expl, '', postfix='{}_weights'.format(vec_cls.__name__))
assert 'text__security' in html_expl
assert 'url__log' in html_expl
assert 'BIAS' in html_expl

pred_res = explain_prediction(clf, data[0], vec)
html_expl = format_as_html(pred_res, force_weights=False)
write_html(clf, html_expl, '', postfix=vec_cls.__name__)
assert 'text: Highlighted in text (sum)' in html_expl
assert 'url: Highlighted in text (sum)' in html_expl
assert '<b>url:</b> <span' in html_expl
Expand All @@ -202,7 +209,7 @@ def test_explain_feature_union(vec_cls):

@pytest.mark.parametrize(['vec_cls'], [
[CountVectorizer],
#[HashingVectorizer],
[HashingVectorizer],
])
def test_explain_feature_union_with_nontext(vec_cls):
data = [
Expand All @@ -218,16 +225,24 @@ def test_explain_feature_union_with_nontext(vec_cls):
'text': 'security'},
]
ys = [1, 0, 0, 0, 1]
vec = FeatureUnion([
('score', DictVectorizer()),
('text', vec_cls(preprocessor=lambda x: x['text'])),
])
score_vec = DictVectorizer()
text_vec = vec_cls(preprocessor=lambda x: x['text'])
vec = FeatureUnion([('score', score_vec), ('text', text_vec)])
xs = vec.fit_transform(data)
clf = LogisticRegression(random_state=42)
clf.fit(xs, ys)

ivec = invert_hashing_and_fit(vec, data)
weights_res = explain_weights(clf, ivec)
html_expl = format_as_html(weights_res)
write_html(clf, html_expl, '', postfix='{}_weights'.format(vec_cls.__name__))
assert 'score__score' in html_expl
assert 'text__security' in html_expl
assert 'BIAS' in html_expl

res = explain_prediction(clf, data[0], vec)
html_expl = format_as_html(res, force_weights=False)
write_html(clf, html_expl, '')
write_html(clf, html_expl, '', postfix=vec_cls.__name__)
assert 'text: Highlighted in text (sum)' in html_expl
assert '<b>text:</b> <span' in html_expl
assert 'BIAS' in html_expl
Expand Down