Skip to content

Commit

Permalink
Merge pull request #3569 from janezd/jaccard-nonbinary
Browse files Browse the repository at this point in the history
[FIX] OWDistances: Use only binary features for Jaccard distance
  • Loading branch information
lanzagar authored Feb 15, 2019
2 parents a5e85be + 894aa1b commit f155396
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 7 deletions.
3 changes: 2 additions & 1 deletion Orange/distance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute,
Mahalanobis, MahalanobisDistance, Hamming)

from .base import _preprocess, remove_discrete_features, impute
from .base import (
_preprocess, remove_discrete_features, remove_nonbinary_features, impute)
9 changes: 9 additions & 0 deletions Orange/distance/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,15 @@ def remove_discrete_features(data):
return data.transform(new_domain)


def remove_nonbinary_features(data):
    """Return `data` transformed to a domain with only binary attributes.

    An attribute is kept when it is discrete and has exactly two values.
    Class variables and metas are carried over unchanged.
    """
    domain = data.domain
    binary_attrs = [attr for attr in domain.attributes
                    if attr.is_discrete and len(attr.values) == 2]
    binary_domain = Domain(binary_attrs, domain.class_vars, domain.metas)
    return data.transform(binary_domain)

def impute(data):
    """Return `data` with missing values filled in by `SklImpute`."""
    imputer = SklImpute()
    return imputer(data)
Expand Down
25 changes: 22 additions & 3 deletions Orange/widgets/unsupervised/owdistances.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,14 @@ class Outputs:

class Error(OWWidget.Error):
# Error-level messages of this widget.
# Raised when categorical features have to be dropped but the data has
# no numeric features left to compute distances on.
no_continuous_features = Msg("No numeric features")
# Raised for the Jaccard metric when the data has no binary features.
no_binary_features = Msg("No binary features")
# The placeholder receives the name of the selected metric.
dense_metric_sparse_data = Msg("{} requires dense data.")
distances_memory_error = Msg("Not enough memory")
# NOTE(review): the placeholder presumably receives the text of the
# underlying exception -- confirm at the call site.
distances_value_error = Msg("Problem in calculation:\n{}")

class Warning(OWWidget.Warning):
# Warning-level messages of this widget.
# Shown when categorical features are removed before computing distances.
ignoring_discrete = Msg("Ignoring categorical features")
# Shown for the Jaccard metric when only some of the features are binary
# and the remaining ones are removed.
ignoring_nonbinary = Msg("Ignoring non-binary features")
# Shown when the metric does not support missing values and the data
# contains NaNs, so the data is imputed first.
imputing_data = Msg("Missing values were imputed")

def __init__(self):
Expand Down Expand Up @@ -112,30 +114,47 @@ def _check_sparse():
if issparse(data.X) and not metric.supports_sparse:
self.Error.dense_metric_sparse_data(METRICS[self.metric_idx][0])
return False
return True

def _fix_discrete():
nonlocal data
if data.domain.has_discrete_attributes() and (
issparse(data.X) and getattr(metric, "fallback", None)
or not metric.supports_discrete
or self.axis == 1):
or self.axis == 1 and metric is not distance.Jaccard):
if not data.domain.has_continuous_attributes():
self.Error.no_continuous_features()
return False
self.Warning.ignoring_discrete()
data = distance.remove_discrete_features(data)
return True

def _fix_nonbinary():
nonlocal data
if metric is distance.Jaccard:
nbinary = sum(a.is_discrete and len(a.values) == 2
for a in data.domain.attributes)
if not nbinary:
self.Error.no_binary_features()
return False
elif nbinary < len(data.domain.attributes):
self.Warning.ignoring_nonbinary()
data = distance.remove_nonbinary_features(data)
return True

def _fix_missing():
nonlocal data
if not metric.supports_missing and bn.anynan(data.X):
self.Warning.imputing_data()
data = distance.impute(data)
return True

self.clear_messages()
if data is None:
return
for check in (_check_sparse, _fix_discrete, _fix_missing):
if check() is False:
for check in (_check_sparse,
_fix_discrete, _fix_missing, _fix_nonbinary):
if not check():
return
try:
if metric.supports_normalization and self.normalized_dist:
Expand Down
42 changes: 39 additions & 3 deletions Orange/widgets/unsupervised/tests/test_owdistances.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

import numpy as np

from Orange.data import Table
from Orange.data import Table, Domain
from Orange import distance
from Orange.widgets.unsupervised.owdistances import OWDistances, METRICS
from Orange.widgets.tests.base import WidgetTest

Expand Down Expand Up @@ -32,8 +33,9 @@ def test_distance_combo(self):
else:
expected = metric(self.iris)

np.testing.assert_array_equal(
expected, self.get_output(self.widget.Outputs.distances))
if metric is not distance.Jaccard:
np.testing.assert_array_equal(
expected, self.get_output(self.widget.Outputs.distances))

def test_error_message(self):
"""Check if error message appears and then disappears when
Expand All @@ -46,6 +48,40 @@ def test_error_message(self):
self.send_signal(self.widget.Inputs.data, None)
self.assertFalse(self.widget.Error.no_continuous_features.is_shown())

def test_jaccard_messages(self):
    """Check the Jaccard-specific error and warning for various inputs.

    With Jaccard selected, the widget is expected to show the
    "no binary features" error when no feature is binary, the
    "ignoring non-binary" warning when only some features are binary,
    and neither message when all features are binary or the input is
    removed.
    """
    # Select the Jaccard metric. The `else` branch makes the test fail
    # loudly if Jaccard is ever missing from METRICS, instead of silently
    # running the checks below against whichever metric comes last.
    for self.widget.metric_idx, (name, _) in enumerate(METRICS):
        if name == "Jaccard":
            break
    else:
        self.fail("Jaccard is not among METRICS")

    # No binary features: error, no warning.
    self.send_signal(self.widget.Inputs.data, self.iris)
    self.assertTrue(self.widget.Error.no_binary_features.is_shown())
    self.assertFalse(self.widget.Warning.ignoring_nonbinary.is_shown())

    # Removing the input clears both messages.
    self.send_signal(self.widget.Inputs.data, None)
    self.assertFalse(self.widget.Error.no_binary_features.is_shown())
    self.assertFalse(self.widget.Warning.ignoring_nonbinary.is_shown())

    # Some, but not all, features are binary: warning, no error.
    self.send_signal(self.widget.Inputs.data, self.titanic)
    self.assertFalse(self.widget.Error.no_binary_features.is_shown())
    self.assertTrue(self.widget.Warning.ignoring_nonbinary.is_shown())

    # The warning is cleared with the input ...
    self.send_signal(self.widget.Inputs.data, None)
    self.assertFalse(self.widget.Error.no_binary_features.is_shown())
    self.assertFalse(self.widget.Warning.ignoring_nonbinary.is_shown())

    # ... and shown again when the same data is re-sent.
    self.send_signal(self.widget.Inputs.data, self.titanic)
    self.assertFalse(self.widget.Error.no_binary_features.is_shown())
    self.assertTrue(self.widget.Warning.ignoring_nonbinary.is_shown())

    # Without the first attribute, neither message is expected.
    dom = self.titanic.domain
    dom = Domain(dom.attributes[1:], dom.class_var)
    self.send_signal(self.widget.Inputs.data, self.titanic.transform(dom))
    self.assertFalse(self.widget.Error.no_binary_features.is_shown())
    self.assertFalse(self.widget.Warning.ignoring_nonbinary.is_shown())

    # NOTE(review): heart_disease presumably mixes numeric and
    # categorical features -- confirm. Jaccard must neither fail nor
    # report that categorical features are being ignored.
    self.send_signal(self.widget.Inputs.data, Table("heart_disease"))
    self.assertFalse(self.widget.Error.no_binary_features.is_shown())
    self.assertFalse(self.widget.Warning.ignoring_discrete.is_shown())

def test_too_big_array(self):
"""
Users sees an error message when calculating too large arrays and Orange
Expand Down

0 comments on commit f155396

Please sign in to comment.