Skip to content

Commit

Permalink
Merge pull request #176 from TeamHG-Memex/union-hashing-vec-3
Browse files Browse the repository at this point in the history
Support hashing vectorizer inside a union
  • Loading branch information
kmike authored May 2, 2017
2 parents d9882cb + b89d0f3 commit 73f0ac2
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 34 deletions.
11 changes: 11 additions & 0 deletions docs/source/libraries/sklearn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,17 @@ automatically; to handle HashingVectorizer_ and FeatureHasher_ for
# and ``ivec`` can be used as a vectorizer for eli5.explain_weights:
eli5.explain_weights(clf, vec=ivec)

HashingVectorizer_ is also supported inside a FeatureUnion_.
:func:`eli5.explain_prediction` handles this case automatically; for
:func:`eli5.explain_weights`, use :func:`eli5.sklearn.invert_hashing_and_fit`
(it works for a plain HashingVectorizer_ too). It takes the FeatureUnion_ apart,
inverts and fits all hashing vectorizers in it, and returns a new FeatureUnion_::

from eli5.sklearn import invert_hashing_and_fit

ivec = invert_hashing_and_fit(vec, X_sample)
eli5.explain_weights(clf, vec=ivec)

.. _FeatureHasher: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html#sklearn.feature_extraction.FeatureHasher

.. _sklearn-text-highlighting:
Expand Down
6 changes: 5 additions & 1 deletion eli5/sklearn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,8 @@
explain_prediction_linear_classifier,
explain_prediction_linear_regressor,
)
from .unhashing import InvertableHashingVectorizer, FeatureUnhasher
from .unhashing import (
InvertableHashingVectorizer,
FeatureUnhasher,
invert_hashing_and_fit,
)
95 changes: 89 additions & 6 deletions eli5/sklearn/unhashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
from __future__ import absolute_import
from collections import defaultdict, Counter
from itertools import chain
from typing import List, Iterable, Any, Dict
from typing import List, Iterable, Any, Dict, Tuple, Union

import numpy as np # type: ignore
import six
from sklearn.base import BaseEstimator, TransformerMixin # type: ignore
from sklearn.feature_extraction.text import ( # type: ignore
HashingVectorizer,
FeatureHasher,
)
from sklearn.pipeline import FeatureUnion # type: ignore

from eli5._feature_names import FeatureNames

Expand Down Expand Up @@ -161,7 +163,11 @@ def recalculate_attributes(self, force=False):
"""
if not self._attributes_dirty and not force:
return
terms = np.array([term for term, _ in self._term_counts.most_common()])
terms = [term for term, _ in self._term_counts.most_common()]
if six.PY2:
terms = np.array(terms, dtype=np.object)
else:
terms = np.array(terms)
if len(terms):
indices, signs = _get_indices_and_signs(self.hasher, terms)
else:
Expand Down Expand Up @@ -225,14 +231,91 @@ def _invert_signs(signs):
return signs[0] < 0


def handle_hashing_vec(vec, feature_names, coef_scale):
def is_invhashing(vec):
    """ Tell whether ``vec`` is an InvertableHashingVectorizer instance.
    """
    if isinstance(vec, InvertableHashingVectorizer):
        return True
    return False


def handle_hashing_vec(vec, feature_names, coef_scale, with_coef_scale=True):
    """ Return feature_names and coef_scale (if with_coef_scale is True),
    calling .get_feature_names for invhashing vectorizers.

    Values passed in by the caller are never overwritten: ``feature_names``
    is only computed when it is None, and ``coef_scale`` is only computed
    when it is None and ``with_coef_scale`` is True.

    ``vec`` may be an InvertableHashingVectorizer, or a FeatureUnion that
    contains at least one of them; any other ``vec`` leaves both values
    unchanged.

    Returns a ``(feature_names, coef_scale)`` tuple when ``with_coef_scale``
    is True, and just ``feature_names`` otherwise.
    """
    # This flag already implies ``coef_scale is None``, so no separate
    # check of coef_scale is needed below.
    needs_coef_scale = with_coef_scale and coef_scale is None
    if is_invhashing(vec):
        if feature_names is None:
            feature_names = vec.get_feature_names(always_signed=False)
        if needs_coef_scale:
            coef_scale = vec.column_signs_
    elif (isinstance(vec, FeatureUnion) and
            any(is_invhashing(v) for _, v in vec.transformer_list) and
            (needs_coef_scale or feature_names is None)):
        # Only walk the union when at least one of the two values
        # is actually missing.
        _feature_names, _coef_scale = _invhashing_union_feature_names_scale(vec)
        if feature_names is None:
            feature_names = _feature_names
        if needs_coef_scale:
            coef_scale = _coef_scale
    return (feature_names, coef_scale) if with_coef_scale else feature_names


def _invhashing_union_feature_names_scale(vec_union):
    # type: (FeatureUnion) -> Tuple[FeatureNames, np.ndarray]
    """ Build feature names and coef_scale for a FeatureUnion that
    contains InvertableHashingVectorizer transformers.

    Transformers are visited in ``transformer_list`` order; every feature
    name is prefixed with ``<transformer name>__`` and its index is offset
    by the number of features of the preceding transformers.

    The returned coef_scale is NaN everywhere except for the columns that
    belong to invhashing vectorizers, where it holds their ``column_signs_``.
    ``unkn_template`` is taken from the last invhashing vectorizer visited.
    """
    names_by_index = {}  # type: Dict[int, Union[str, List]]
    template = None
    offset = 0
    sign_blocks = []  # (start column, column signs) per invhashing vectorizer
    for name, transformer in vec_union.transformer_list:
        if not isinstance(transformer, InvertableHashingVectorizer):
            plain_names = transformer.get_feature_names()
            for position, fname in enumerate(plain_names, offset):
                names_by_index[position] = '{}__{}'.format(name, fname)
            offset += len(plain_names)
            continue
        hashed_names = transformer.get_feature_names(always_signed=False)
        template = hashed_names.unkn_template
        for idx, candidates in hashed_names.feature_names.items():
            # Copy each candidate dict so the vectorizer's own
            # feature names are left untouched.
            names_by_index[idx + offset] = [
                dict(cand, name='{}__{}'.format(name, cand['name']))
                for cand in candidates]
        sign_blocks.append((offset, transformer.column_signs_))
        offset += hashed_names.n_features
    n_features = offset
    feature_names = FeatureNames(
        feature_names=names_by_index,
        n_features=n_features,
        unkn_template=template)
    coef_scale = np.full(n_features, np.nan)
    for start, signs in sign_blocks:
        coef_scale[start: start + len(signs)] = signs
    return feature_names, coef_scale


def is_invhashing(vec):
return isinstance(vec, InvertableHashingVectorizer)
def invert_hashing_and_fit(vec, docs):
    # type: (Union[FeatureUnion, HashingVectorizer], Any) -> Union[FeatureUnion, InvertableHashingVectorizer]
    """ Wrap hashing vectorizer ``vec`` into an InvertableHashingVectorizer
    fitted on ``docs``. For a FeatureUnion that has hashing vectorizers
    among its transformers, do this for each of them and return
    a new FeatureUnion.
    Any other ``vec`` is returned unchanged.
    """
    if isinstance(vec, HashingVectorizer):
        inverted = InvertableHashingVectorizer(vec)
        inverted.fit(docs)
        return inverted
    if isinstance(vec, FeatureUnion):
        has_hashing = any(
            isinstance(transformer, HashingVectorizer)
            for _, transformer in vec.transformer_list)
        if has_hashing:
            return _fit_invhashing_union(vec, docs)
    return vec


def _fit_invhashing_union(vec_union, docs):
    # type: (FeatureUnion, Any) -> FeatureUnion
    """ Return a new FeatureUnion in which every transformer of
    ``vec_union`` has been passed through invert_hashing_and_fit
    with ``docs``; transformer_weights and n_jobs are carried over.
    """
    converted = []
    for name, transformer in vec_union.transformer_list:
        converted.append((name, invert_hashing_and_fit(transformer, docs)))
    return FeatureUnion(
        converted,
        transformer_weights=vec_union.transformer_weights,
        n_jobs=vec_union.n_jobs)
16 changes: 7 additions & 9 deletions eli5/sklearn/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
import numpy as np # type: ignore
import scipy.sparse as sp # type: ignore
from sklearn.multiclass import OneVsRestClassifier # type: ignore
from sklearn.feature_extraction.text import HashingVectorizer # type: ignore

from eli5.sklearn.unhashing import InvertableHashingVectorizer, is_invhashing
from eli5.sklearn.unhashing import invert_hashing_and_fit, handle_hashing_vec
from eli5._feature_names import FeatureNames


Expand Down Expand Up @@ -204,13 +203,12 @@ def get_X(doc, vec=None, vectorized=False, to_dense=False):


def handle_vec(clf, doc, vec, vectorized, feature_names, num_features=None):
    """ Prepare the vectorizer and feature names for explaining ``doc``.

    Unless ``vectorized`` is True, ``vec`` is passed through
    invert_hashing_and_fit with ``[doc]`` as the fitting data. Feature
    names are then resolved with handle_hashing_vec and get_feature_names.

    Returns a ``(vec, feature_names)`` tuple.
    """
    if not vectorized:
        vec = invert_hashing_and_fit(vec, [doc])
    # Explaining predictions does not need coef_scale
    # because it is handled by the vectorizer.
    feature_names = handle_hashing_vec(
        vec, feature_names, coef_scale=None, with_coef_scale=False)
    feature_names = get_feature_names(
        clf, vec, feature_names=feature_names, num_features=num_features)
    return vec, feature_names
Expand Down
51 changes: 33 additions & 18 deletions tests/test_sklearn_vectorizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@
import attr
import pytest

from eli5.sklearn import InvertableHashingVectorizer
from sklearn.base import BaseEstimator
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion

from eli5 import explain_prediction
from eli5 import explain_prediction, explain_weights
from eli5.formatters import format_as_html
from eli5.sklearn import invert_hashing_and_fit, InvertableHashingVectorizer
from .utils import format_as_all, get_all_features, get_names_coefs, write_html


Expand Down Expand Up @@ -165,7 +165,7 @@ def test_explain_regression_hashing_vectorizer(newsgroups_train_binary):

@pytest.mark.parametrize(['vec_cls'], [
[CountVectorizer],
#[HashingVectorizer],
[HashingVectorizer],
])
def test_explain_feature_union(vec_cls):
data = [
Expand All @@ -181,18 +181,25 @@ def test_explain_feature_union(vec_cls):
'text': 'security'},
]
ys = [1, 0, 0, 0, 1]
vec = FeatureUnion([
('url', vec_cls(preprocessor=lambda x: x['url'],
analyzer='char',
ngram_range=(3, 3))),
('text', vec_cls(preprocessor=lambda x: x['text'])),
])
url_vec = vec_cls(
preprocessor=lambda x: x['url'], analyzer='char', ngram_range=(3, 3))
text_vec = vec_cls(preprocessor=lambda x: x['text'])
vec = FeatureUnion([('url', url_vec), ('text', text_vec)])
xs = vec.fit_transform(data)
clf = LogisticRegression(random_state=42)
clf.fit(xs, ys)
res = explain_prediction(clf, data[0], vec)
html_expl = format_as_html(res, force_weights=False)
write_html(clf, html_expl, '')

ivec = invert_hashing_and_fit(vec, data)
weights_res = explain_weights(clf, ivec)
html_expl = format_as_html(weights_res)
write_html(clf, html_expl, '', postfix='{}_weights'.format(vec_cls.__name__))
assert 'text__security' in html_expl
assert 'url__log' in html_expl
assert 'BIAS' in html_expl

pred_res = explain_prediction(clf, data[0], vec)
html_expl = format_as_html(pred_res, force_weights=False)
write_html(clf, html_expl, '', postfix=vec_cls.__name__)
assert 'text: Highlighted in text (sum)' in html_expl
assert 'url: Highlighted in text (sum)' in html_expl
assert '<b>url:</b> <span' in html_expl
Expand All @@ -202,7 +209,7 @@ def test_explain_feature_union(vec_cls):

@pytest.mark.parametrize(['vec_cls'], [
[CountVectorizer],
#[HashingVectorizer],
[HashingVectorizer],
])
def test_explain_feature_union_with_nontext(vec_cls):
data = [
Expand All @@ -218,16 +225,24 @@ def test_explain_feature_union_with_nontext(vec_cls):
'text': 'security'},
]
ys = [1, 0, 0, 0, 1]
vec = FeatureUnion([
('score', DictVectorizer()),
('text', vec_cls(preprocessor=lambda x: x['text'])),
])
score_vec = DictVectorizer()
text_vec = vec_cls(preprocessor=lambda x: x['text'])
vec = FeatureUnion([('score', score_vec), ('text', text_vec)])
xs = vec.fit_transform(data)
clf = LogisticRegression(random_state=42)
clf.fit(xs, ys)

ivec = invert_hashing_and_fit(vec, data)
weights_res = explain_weights(clf, ivec)
html_expl = format_as_html(weights_res)
write_html(clf, html_expl, '', postfix='{}_weights'.format(vec_cls.__name__))
assert 'score__score' in html_expl
assert 'text__security' in html_expl
assert 'BIAS' in html_expl

res = explain_prediction(clf, data[0], vec)
html_expl = format_as_html(res, force_weights=False)
write_html(clf, html_expl, '')
write_html(clf, html_expl, '', postfix=vec_cls.__name__)
assert 'text: Highlighted in text (sum)' in html_expl
assert '<b>text:</b> <span' in html_expl
assert 'BIAS' in html_expl
Expand Down

0 comments on commit 73f0ac2

Please sign in to comment.