Merge pull request #176 from TeamHG-Memex/union-hashing-vec-3

Support hashing vectorizer inside a union
TeamHG-Memex · May 2, 2017 · 73f0ac2 · 73f0ac2
2 parents d9882cb + b89d0f3
commit 73f0ac2
Show file tree

Hide file tree

Showing 5 changed files with 145 additions and 34 deletions.
diff --git a/docs/source/libraries/sklearn.rst b/docs/source/libraries/sklearn.rst
@@ -219,6 +219,17 @@ automatically; to handle HashingVectorizer_ and FeatureHasher_ for
     # and ``ivec`` can be used as a vectorizer for eli5.explain_weights:
     eli5.explain_weights(clf, vec=ivec)
 
+HashingVectorizer_ is also supported inside a FeatureUnion_.
+:func:`eli5.explain_prediction` handles this case automatically; for
+:func:`eli5.explain_weights`, use :func:`eli5.sklearn.invert_hashing_and_fit`
+(it works for a plain HashingVectorizer_ too). It takes the FeatureUnion_ apart,
+inverts and fits all hashing vectorizers, and returns a new FeatureUnion_::
+
+    from eli5.sklearn import invert_hashing_and_fit
+
+    ivec = invert_hashing_and_fit(vec, X_sample)
+    eli5.explain_weights(clf, vec=ivec)
+
 .. _FeatureHasher: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html#sklearn.feature_extraction.FeatureHasher
 
 .. _sklearn-text-highlighting:

diff --git a/eli5/sklearn/__init__.py b/eli5/sklearn/__init__.py
@@ -12,4 +12,8 @@
     explain_prediction_linear_classifier,
     explain_prediction_linear_regressor,
 )
-from .unhashing import InvertableHashingVectorizer, FeatureUnhasher
+from .unhashing import (
+    InvertableHashingVectorizer,
+    FeatureUnhasher,
+    invert_hashing_and_fit,
+)
diff --git a/eli5/sklearn/unhashing.py b/eli5/sklearn/unhashing.py
@@ -5,14 +5,16 @@
 from __future__ import absolute_import
 from collections import defaultdict, Counter
 from itertools import chain
-from typing import List, Iterable, Any, Dict
+from typing import List, Iterable, Any, Dict, Tuple, Union
 
 import numpy as np  # type: ignore
+import six
 from sklearn.base import BaseEstimator, TransformerMixin  # type: ignore
 from sklearn.feature_extraction.text import (  # type: ignore
     HashingVectorizer,
     FeatureHasher,
 )
+from sklearn.pipeline import FeatureUnion  # type: ignore
 
 from eli5._feature_names import FeatureNames
 
@@ -161,7 +163,11 @@ def recalculate_attributes(self, force=False):
         """
         if not self._attributes_dirty and not force:
             return
-        terms = np.array([term for term, _ in self._term_counts.most_common()])
+        terms = [term for term, _ in self._term_counts.most_common()]
+        if six.PY2:
+            terms = np.array(terms, dtype=np.object)
+        else:
+            terms = np.array(terms)
         if len(terms):
             indices, signs = _get_indices_and_signs(self.hasher, terms)
         else:
@@ -225,14 +231,91 @@ def _invert_signs(signs):
     return signs[0] < 0
 
 
-def handle_hashing_vec(vec, feature_names, coef_scale):
+def is_invhashing(vec):
+    return isinstance(vec, InvertableHashingVectorizer)
+
+
+def handle_hashing_vec(vec, feature_names, coef_scale, with_coef_scale=True):
+    """ Return feature_names and coef_scale (if with_coef_scale is True),
+    calling .get_feature_names for invhashing vectorizers (also in a FeatureUnion).
+    """
+    needs_coef_scale = with_coef_scale and coef_scale is None
     if is_invhashing(vec):
         if feature_names is None:
             feature_names = vec.get_feature_names(always_signed=False)
-        if coef_scale is None:
+        if needs_coef_scale:
             coef_scale = vec.column_signs_
+    elif (isinstance(vec, FeatureUnion) and
+              any(is_invhashing(v) for _, v in vec.transformer_list) and
+              (needs_coef_scale or feature_names is None)):
+        _feature_names, _coef_scale = _invhashing_union_feature_names_scale(vec)
+        if feature_names is None:
+            feature_names = _feature_names
+        if needs_coef_scale:
+            coef_scale = _coef_scale
+    return (feature_names, coef_scale) if with_coef_scale else feature_names
+
+
+def _invhashing_union_feature_names_scale(vec_union):
+    # type: (FeatureUnion) -> Tuple[FeatureNames, np.ndarray]
+    feature_names_store = {}  # type: Dict[int, Union[str, List]]
+    unkn_template = None
+    shift = 0
+    coef_scale_values = []
+    for vec_name, vec in vec_union.transformer_list:
+        if isinstance(vec, InvertableHashingVectorizer):
+            vec_feature_names = vec.get_feature_names(always_signed=False)
+            unkn_template = vec_feature_names.unkn_template
+            for idx, fs in vec_feature_names.feature_names.items():
+                new_fs = []
+                for f in fs:
+                    new_f = dict(f)
+                    new_f['name'] = '{}__{}'.format(vec_name, f['name'])
+                    new_fs.append(new_f)
+                feature_names_store[idx + shift] = new_fs
+            coef_scale_values.append((shift, vec.column_signs_))
+            shift += vec_feature_names.n_features
+        else:
+            vec_feature_names = vec.get_feature_names()
+            feature_names_store.update(
+                (shift + idx, '{}__{}'.format(vec_name, fname))
+                for idx, fname in enumerate(vec_feature_names))
+            shift += len(vec_feature_names)
+    n_features = shift
+    feature_names = FeatureNames(
+        feature_names=feature_names_store,
+        n_features=n_features,
+        unkn_template=unkn_template)
+    coef_scale = np.ones(n_features) * np.nan
+    for idx, values in coef_scale_values:
+        coef_scale[idx: idx + len(values)] = values
     return feature_names, coef_scale
 
 
-def is_invhashing(vec):
-    return isinstance(vec, InvertableHashingVectorizer)
+def invert_hashing_and_fit(vec, docs):
+    # type: (Union[FeatureUnion, HashingVectorizer], Any) -> Union[FeatureUnion, InvertableHashingVectorizer]
+    """ Create an InvertableHashingVectorizer from hashing vectorizer vec
+    and fit it on docs. If vec is a FeatureUnion, do it for all
+    hashing vectorizers in the union.
+    Returns an InvertableHashingVectorizer, or a new FeatureUnion,
+    or vec itself when there is no hashing vectorizer to invert.
+    """
+    if isinstance(vec, HashingVectorizer):
+        vec = InvertableHashingVectorizer(vec)
+        vec.fit(docs)
+    elif (isinstance(vec, FeatureUnion) and
+              any(isinstance(v, HashingVectorizer)
+                  for _, v in vec.transformer_list)):
+        vec = _fit_invhashing_union(vec, docs)
+    return vec
+
+
+def _fit_invhashing_union(vec_union, docs):
+    # type: (FeatureUnion, Any) -> FeatureUnion
+    """ Rebuild a FeatureUnion, passing each member to invert_hashing_and_fit with docs.
+    """
+    return FeatureUnion(
+        [(name, invert_hashing_and_fit(v, docs))
+         for name, v in vec_union.transformer_list],
+        transformer_weights=vec_union.transformer_weights,
+        n_jobs=vec_union.n_jobs)
diff --git a/eli5/sklearn/utils.py b/eli5/sklearn/utils.py
@@ -5,9 +5,8 @@
 import numpy as np  # type: ignore
 import scipy.sparse as sp  # type: ignore
 from sklearn.multiclass import OneVsRestClassifier  # type: ignore
-from sklearn.feature_extraction.text import HashingVectorizer  # type: ignore
 
-from eli5.sklearn.unhashing import InvertableHashingVectorizer, is_invhashing
+from eli5.sklearn.unhashing import invert_hashing_and_fit, handle_hashing_vec
 from eli5._feature_names import FeatureNames
 
 
@@ -204,13 +203,12 @@ def get_X(doc, vec=None, vectorized=False, to_dense=False):
 
 
 def handle_vec(clf, doc, vec, vectorized, feature_names, num_features=None):
-    if isinstance(vec, HashingVectorizer) and not vectorized:
-        vec = InvertableHashingVectorizer(vec)
-        vec.fit([doc])
-    if is_invhashing(vec) and feature_names is None:
-        # Explaining predictions does not need coef_scale,
-        # because it is handled by the vectorizer.
-        feature_names = vec.get_feature_names(always_signed=False)
+    if not vectorized:
+        vec = invert_hashing_and_fit(vec, [doc])
+    # Explaining predictions does not need coef_scale
+    # because it is handled by the vectorizer.
+    feature_names = handle_hashing_vec(
+        vec, feature_names, coef_scale=None, with_coef_scale=False)
     feature_names = get_feature_names(
         clf, vec, feature_names=feature_names, num_features=num_features)
     return vec, feature_names

diff --git a/tests/test_sklearn_vectorizers.py b/tests/test_sklearn_vectorizers.py
@@ -5,16 +5,16 @@
 import attr
 import pytest
 
-from eli5.sklearn import InvertableHashingVectorizer
 from sklearn.base import BaseEstimator
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
 from sklearn.linear_model import LinearRegression
 from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import FeatureUnion
 
-from eli5 import explain_prediction
+from eli5 import explain_prediction, explain_weights
 from eli5.formatters import format_as_html
+from eli5.sklearn import invert_hashing_and_fit, InvertableHashingVectorizer
 from .utils import format_as_all, get_all_features, get_names_coefs, write_html
 
 
@@ -165,7 +165,7 @@ def test_explain_regression_hashing_vectorizer(newsgroups_train_binary):
 
 @pytest.mark.parametrize(['vec_cls'], [
     [CountVectorizer],
-   #[HashingVectorizer],
+    [HashingVectorizer],
 ])
 def test_explain_feature_union(vec_cls):
     data = [
@@ -181,18 +181,25 @@ def test_explain_feature_union(vec_cls):
          'text': 'security'},
         ]
     ys = [1, 0, 0, 0, 1]
-    vec = FeatureUnion([
-        ('url', vec_cls(preprocessor=lambda x: x['url'],
-                        analyzer='char',
-                        ngram_range=(3, 3))),
-        ('text', vec_cls(preprocessor=lambda x: x['text'])),
-    ])
+    url_vec = vec_cls(
+        preprocessor=lambda x: x['url'], analyzer='char', ngram_range=(3, 3))
+    text_vec = vec_cls(preprocessor=lambda x: x['text'])
+    vec = FeatureUnion([('url', url_vec), ('text', text_vec)])
     xs = vec.fit_transform(data)
     clf = LogisticRegression(random_state=42)
     clf.fit(xs, ys)
-    res = explain_prediction(clf, data[0], vec)
-    html_expl = format_as_html(res, force_weights=False)
-    write_html(clf, html_expl, '')
+
+    ivec = invert_hashing_and_fit(vec, data)
+    weights_res = explain_weights(clf, ivec)
+    html_expl = format_as_html(weights_res)
+    write_html(clf, html_expl, '', postfix='{}_weights'.format(vec_cls.__name__))
+    assert 'text__security' in html_expl
+    assert 'url__log' in html_expl
+    assert 'BIAS' in html_expl
+
+    pred_res = explain_prediction(clf, data[0], vec)
+    html_expl = format_as_html(pred_res, force_weights=False)
+    write_html(clf, html_expl, '', postfix=vec_cls.__name__)
     assert 'text: Highlighted in text (sum)' in html_expl
     assert 'url: Highlighted in text (sum)' in html_expl
     assert '<b>url:</b> <span' in html_expl
@@ -202,7 +209,7 @@ def test_explain_feature_union(vec_cls):
 
 @pytest.mark.parametrize(['vec_cls'], [
     [CountVectorizer],
-    #[HashingVectorizer],
+    [HashingVectorizer],
 ])
 def test_explain_feature_union_with_nontext(vec_cls):
     data = [
@@ -218,16 +225,24 @@ def test_explain_feature_union_with_nontext(vec_cls):
          'text': 'security'},
     ]
     ys = [1, 0, 0, 0, 1]
-    vec = FeatureUnion([
-        ('score', DictVectorizer()),
-        ('text', vec_cls(preprocessor=lambda x: x['text'])),
-    ])
+    score_vec = DictVectorizer()
+    text_vec = vec_cls(preprocessor=lambda x: x['text'])
+    vec = FeatureUnion([('score', score_vec), ('text', text_vec)])
     xs = vec.fit_transform(data)
     clf = LogisticRegression(random_state=42)
     clf.fit(xs, ys)
+
+    ivec = invert_hashing_and_fit(vec, data)
+    weights_res = explain_weights(clf, ivec)
+    html_expl = format_as_html(weights_res)
+    write_html(clf, html_expl, '', postfix='{}_weights'.format(vec_cls.__name__))
+    assert 'score__score' in html_expl
+    assert 'text__security' in html_expl
+    assert 'BIAS' in html_expl
+
     res = explain_prediction(clf, data[0], vec)
     html_expl = format_as_html(res, force_weights=False)
-    write_html(clf, html_expl, '')
+    write_html(clf, html_expl, '', postfix=vec_cls.__name__)
     assert 'text: Highlighted in text (sum)' in html_expl
     assert '<b>text:</b> <span' in html_expl
     assert 'BIAS' in html_expl