From a89beeeebbea950e0b53a2f40cabe77ebf1e0d20 Mon Sep 17 00:00:00 2001 From: janezd Date: Thu, 27 Feb 2020 20:49:22 +0100 Subject: [PATCH] OWContinuize: Provide the same options as in Preprocess/Normalize --- Orange/widgets/data/owcontinuize.py | 223 ++++++++---------- .../widgets/data/tests/test_owcontinuize.py | 121 +++++++--- 2 files changed, 185 insertions(+), 159 deletions(-) diff --git a/Orange/widgets/data/owcontinuize.py b/Orange/widgets/data/owcontinuize.py index 5573d81eba8..9a7ece80711 100644 --- a/Orange/widgets/data/owcontinuize.py +++ b/Orange/widgets/data/owcontinuize.py @@ -1,13 +1,13 @@ from functools import reduce +from types import SimpleNamespace from AnyQt.QtCore import Qt import Orange.data from Orange.util import Reprable from Orange.statistics import distribution -from Orange.preprocess import Continuize, Normalize -from Orange.preprocess.transformation import \ - Identity, Indicator, Indicator1, Normalizer +from Orange.preprocess import Continuize +from Orange.preprocess.transformation import Identity, Indicator, Normalizer from Orange.data.table import Table from Orange.widgets import gui, widget from Orange.widgets.settings import Setting @@ -34,16 +34,13 @@ class Outputs: buttons_area_orientation = Qt.Vertical resizing_enabled = False - # continuous treats - Leave, NormalizeBySpan, NormalizeBySD = range(3) + Normalize = SimpleNamespace(Leave=0, Standardize=1, Center=2, Scale=3, + Normalize11=4, Normalize01=5) + settings_version = 2 multinomial_treatment = Setting(0) - zero_based = Setting(1) - continuous_treatment = Setting(Leave) + continuous_treatment = Setting(Normalize.Leave) class_treatment = Setting(0) - - transform_class = Setting(False) - autosend = Setting(True) multinomial_treats = ( @@ -56,9 +53,13 @@ class Outputs: ("Divide by number of values", Continuize.AsNormalizedOrdinal)) continuous_treats = ( - ("Leave them as they are", Continuize.Leave), - ("Normalize by span", Normalize.NormalizeBySpan), - ("Normalize by standard deviation", Normalize.NormalizeBySD)) + ("Leave them as they are", True), + ("Standardize to μ=0, σ²=1", False), + ("Center to μ=0", False), + ("Scale to σ²=1", True), + ("Normalize to interval [-1, 1]", False), + ("Normalize to interval [0, 1]", False) + ) class_treats = ( ("Leave it as it is", Continuize.Leave), @@ -67,8 +68,6 @@ class Outputs: ("One class per value", Continuize.Indicators), ) - value_ranges = ["From -1 to 1", "From 0 to 1"] - def __init__(self): super().__init__() @@ -84,19 +83,12 @@ def __init__(self): btnLabels=[x[0] for x in self.continuous_treats], callback=self.settings_changed) - box = gui.vBox(self.controlArea, "Categorical Outcomes") + box = gui.vBox(self.controlArea, "Categorical Outcome(s)") gui.radioButtonsInBox( box, self, "class_treatment", btnLabels=[t[0] for t in self.class_treats], callback=self.settings_changed) - zbbox = gui.vBox(self.controlArea, "Value Range") - - gui.radioButtonsInBox( - zbbox, self, "zero_based", - btnLabels=self.value_ranges, - callback=self.settings_changed) - gui.auto_apply(self.buttonsArea, self, "autosend", box=False) self.data = None @@ -120,31 +112,27 @@ def setData(self, data): self.unconditional_commit() def enable_normalization(self): - enable = not (self.data and self.data.is_sparse()) - if not enable and self.continuous_treatment in (self.NormalizeBySpan, - self.NormalizeBySD): - self.continuous_treatment = self.Leave buttons = self.controls.continuous_treatment.buttons - buttons[self.NormalizeBySpan].setEnabled(enable) - buttons[self.NormalizeBySD].setEnabled(enable) + if self.data is not None and self.data.is_sparse(): + if self.continuous_treatment == self.Normalize.Standardize: + self.continuous_treatment = self.Normalize.Scale + else: + self.continuous_treatment = self.Normalize.Leave + for button, (_, supports_sparse) \ + in zip(buttons, self.continuous_treats): + button.setEnabled(supports_sparse) + else: + for button in buttons: + button.setEnabled(True) def constructContinuizer(self): conzer = DomainContinuizer( - zero_based=self.zero_based, multinomial_treatment=self.multinomial_treats[self.multinomial_treatment][1], - continuous_treatment=self.continuous_treats[self.continuous_treatment][1], + continuous_treatment=self.continuous_treatment, class_treatment=self.class_treats[self.class_treatment][1] ) return conzer - # def sendPreprocessor(self): - # continuizer = self.constructContinuizer() - # self.send("Preprocessor", PreprocessedLearner( - # lambda data, weightId=0, tc=(self.targetValue if self.classTreatment else -1): - # Table(continuizer(data, weightId, tc) - # if data.domain.has_discrete_class - # else continuizer(data, weightId), data))) - def commit(self): continuizer = self.constructContinuizer() if self.data: @@ -155,7 +143,6 @@ def commit(self): else: self.Outputs.data.send(self.data) # None or empty data - def send_report(self): self.report_items( "Settings", @@ -163,8 +150,21 @@ def send_report(self): self.multinomial_treats[self.multinomial_treatment][0]), ("Numeric features", self.continuous_treats[self.continuous_treatment][0]), - ("Class", self.class_treats[self.class_treatment][0]), - ("Value range", self.value_ranges[self.zero_based])]) + ("Class", self.class_treats[self.class_treatment][0])]) + + @classmethod + def migrate_settings(cls, settings, version): + if version < 2: + Normalize = cls.Normalize + cont_treat = settings.pop("continuous_treatment", 0) + zero_based = settings.pop("zero_based", True) + if cont_treat == 1: + if zero_based: + settings["continuous_treatment"] = Normalize.Normalize01 + else: + settings["continuous_treatment"] = Normalize.Normalize11 + elif cont_treat == 2: + settings["continuous_treatment"] = Normalize.Standardize class WeightedIndicator(Indicator): @@ -179,56 +179,33 @@ def transform(self, c): return t -class WeightedIndicator1(Indicator1): - def __init__(self, variable, value, weight=1.0): - super().__init__(variable, value) - self.weight = weight - - def transform(self, c): - t = super().transform(c) * self.weight - if self.weight != 1.0: - t *= self.weight - return t - - -def make_indicator_var(source, value_ind, weight=None, zero_based=True): - if zero_based and weight is None: +def make_indicator_var(source, value_ind, weight=None): + if weight is None: indicator = Indicator(source, value=value_ind) - elif zero_based: - indicator = WeightedIndicator(source, value=value_ind, weight=weight) - elif weight is None: - indicator = Indicator1(source, value=value_ind) else: - indicator = WeightedIndicator1(source, value=value_ind, weight=weight) + indicator = WeightedIndicator(source, value=value_ind, weight=weight) return Orange.data.ContinuousVariable( "{}={}".format(source.name, source.values[value_ind]), compute_value=indicator ) -def dummy_coding(var, base_value=0, zero_based=True): +def dummy_coding(var, base_value=0): N = len(var.values) - return [make_indicator_var(var, i, zero_based=zero_based) + return [make_indicator_var(var, i) for i in range(N) if i != base_value] -def one_hot_coding(var, zero_based=True): +def one_hot_coding(var): N = len(var.values) - return [make_indicator_var(var, i, zero_based=zero_based) - for i in range(N)] + return [make_indicator_var(var, i) for i in range(N)] -def continuize_domain(data_or_domain, +def continuize_domain(data, multinomial_treatment=Continuize.Indicators, continuous_treatment=Continuize.Leave, - class_treatment=Continuize.Leave, - zero_based=True): - - if isinstance(data_or_domain, Orange.data.Domain): - data, domain = None, data_or_domain - else: - data, domain = data_or_domain, data_or_domain.domain - + class_treatment=Continuize.Leave): + domain = data.domain def needs_dist(var, mtreat, ctreat): "Does the `var` need a distribution given specified flags" if var.is_discrete: @@ -258,14 +235,11 @@ def needs_dist(var, mtreat, ctreat): dist_iter = iter(dist) newattrs = [continuize_var(var, next(dist_iter) if needs_dist else None, - multinomial_treatment, continuous_treatment, - zero_based) + multinomial_treatment, continuous_treatment) for var, needs_dist in zip(domain.attributes, attr_needs_dist)] - newclass = [continuize_var(var, next(dist_iter) if needs_dist else None, - class_treatment, Continuize.Remove, - zero_based) + class_treatment, Continuize.Remove) for var, needs_dist in zip(domain.class_vars, cls_needs_dist)] newattrs = reduce(list.__iadd__, newattrs, []) @@ -276,16 +250,16 @@ def needs_dist(var, mtreat, ctreat): def continuize_var(var, data_or_dist=None, multinomial_treatment=Continuize.Indicators, - continuous_treatment=Continuize.Leave, - zero_based=True): - + continuous_treatment=Continuize.Leave): def continuize_continuous(): - if continuous_treatment == Normalize.NormalizeBySpan: - return [normalize_by_span(var, data_or_dist, zero_based)] - elif continuous_treatment == Normalize.NormalizeBySD: - return [normalize_by_sd(var, data_or_dist)] - else: + dist = _ensure_dist(var, data_or_dist) + treatments = [lambda var, _: var, + normalize_by_sd, center_to_mean, divide_by_sd, + normalize_to_11, normalize_to_01] + if dist.shape[1] == 0: return [var] + new_var = treatments[continuous_treatment](var, dist) + return [new_var] def continuize_discrete(): if len(var.values) > 2 and \ @@ -299,16 +273,16 @@ def continuize_discrete(): elif multinomial_treatment == Continuize.AsOrdinal: return [ordinal_to_continuous(var)] elif multinomial_treatment == Continuize.AsNormalizedOrdinal: - return [ordinal_to_norm_continuous(var, zero_based)] + return [ordinal_to_norm_continuous(var)] elif multinomial_treatment == Continuize.Indicators: - return one_hot_coding(var, zero_based) + return one_hot_coding(var) elif multinomial_treatment in ( Continuize.FirstAsBase, Continuize.RemoveMultinomial): - return dummy_coding(var, zero_based=zero_based) + return dummy_coding(var) elif multinomial_treatment == Continuize.FrequentAsBase: dist = _ensure_dist(var, data_or_dist) modus = dist.modus() - return dummy_coding(var, base_value=modus, zero_based=zero_based) + return dummy_coding(var, base_value=modus) elif multinomial_treatment == Continuize.Leave: return [var] raise ValueError("Invalid value of `multinomial_treatment`") @@ -345,68 +319,67 @@ def ordinal_to_continuous(var): compute_value=Identity(var)) -def ordinal_to_norm_continuous(var, zero_based=True): +def ordinal_to_norm_continuous(var): n_values = len(var.values) - if zero_based: - return normalized_var(var, 0, 1 / (n_values - 1)) - else: - return normalized_var(var, (n_values - 1) / 2, 2 / (n_values - 1)) + return normalized_var(var, 0, 1 / (n_values - 1)) -def normalize_by_span(var, data_or_dist, zero_based=True): - dist = _ensure_dist(var, data_or_dist) - if dist.shape[1] > 0: - v_max, v_min = dist.max(), dist.min() - else: - v_max, v_min = 0, 0 +def normalize_by_sd(var, dist): + mean, sd = dist.mean(), dist.standard_deviation() + sd = sd if sd > 1e-10 else 1 + return normalized_var(var, mean, 1 / sd) + + +def center_to_mean(var, dist): + return normalized_var(var, dist.mean(), 1) + + +def divide_by_sd(var, dist): + sd = dist.standard_deviation() + sd = sd if sd > 1e-10 else 1 + return normalized_var(var, 0, 1 / sd) + + +def normalize_to_11(var, dist): + return normalize_by_span(var, dist, False) + + +def normalize_to_01(var, dist): + return normalize_by_span(var, dist, True) + + +def normalize_by_span(var, dist, zero_based=True): + v_max, v_min = dist.max(), dist.min() span = (v_max - v_min) if span < 1e-15: span = 1 - if zero_based: return normalized_var(var, v_min, 1 / span) else: return normalized_var(var, (v_min + v_max) / 2, 2 / span) -def normalize_by_sd(var, data_or_dist): - dist = _ensure_dist(var, data_or_dist) - if dist.shape[1] > 0: - mean, sd = dist.mean(), dist.standard_deviation() - else: - mean, sd = 0, 1 - sd = sd if sd > 1e-10 else 1 - return normalized_var(var, mean, 1 / sd) - - class DomainContinuizer(Reprable): - def __init__(self, zero_based=True, + def __init__(self, multinomial_treatment=Continuize.Indicators, continuous_treatment=Continuize.Leave, class_treatment=Continuize.Leave): - self.zero_based = zero_based self.multinomial_treatment = multinomial_treatment self.continuous_treatment = continuous_treatment self.class_treatment = class_treatment def __call__(self, data): treat = self.multinomial_treatment - if isinstance(data, Orange.data.Domain): - domain, data = data, None - else: - domain = data.domain - + domain = data.domain if (treat == Continuize.ReportError and any(var.is_discrete and len(var.values) > 2 for var in domain)): raise ValueError("Domain has multinomial attributes") newdomain = continuize_domain( - data or domain, + data, self.multinomial_treatment, self.continuous_treatment, - self.class_treatment, - self.zero_based - ) + self.class_treatment) return newdomain diff --git a/Orange/widgets/data/tests/test_owcontinuize.py b/Orange/widgets/data/tests/test_owcontinuize.py index dfbe71abdcc..3b6ac974c01 100644 --- a/Orange/widgets/data/tests/test_owcontinuize.py +++ b/Orange/widgets/data/tests/test_owcontinuize.py @@ -5,7 +5,7 @@ import numpy as np -from Orange.data import Table, DiscreteVariable +from Orange.data import Table, DiscreteVariable, ContinuousVariable, Domain from Orange.preprocess import transformation from Orange.widgets.data import owcontinuize from Orange.widgets.data.owcontinuize import OWContinuize @@ -92,7 +92,6 @@ def test_one_column_nan_values_normalize_sd(self): self.send_signal(self.widget.Inputs.data, table) self.widget.unconditional_commit() - def test_one_column_nan_values_normalize_span(self): """ No crash on a column with NaN values and with selected option @@ -112,14 +111,14 @@ def test_one_column_nan_values_normalize_span(self): def test_disable_normalize_sparse(self): def assert_enabled(enabled): - buttons[BySpan].click() - buttons[BySD].click() - self.assertTrue(buttons[Leave].isEnabled()) - self.assertEqual(buttons[BySpan].isEnabled(), enabled) - self.assertEqual(buttons[BySD].isEnabled(), enabled) + for button, (method, supports_sparse) in \ + zip(buttons, w.continuous_treats): + self.assertEqual(button.isEnabled(), enabled or supports_sparse, + msg=f"Error in {method}") + buttons[w.Normalize.Leave].click() + buttons[w.Normalize.Standardize].click() w = self.widget - Leave, BySpan, BySD = w.Leave, w.NormalizeBySpan, w.NormalizeBySD buttons = w.controls.continuous_treatment.buttons iris = Table("iris") sparse_iris = iris.to_sparse() @@ -127,28 +126,101 @@ def assert_enabled(enabled): # input dense self.send_signal(w.Inputs.data, iris) assert_enabled(True) - self.assertEqual(w.continuous_treatment, BySD) + self.assertEqual(w.continuous_treatment, w.Normalize.Standardize) # input sparse self.send_signal(w.Inputs.data, sparse_iris) + self.assertEqual(w.continuous_treatment, w.Normalize.Scale) assert_enabled(False) - self.assertEqual(w.continuous_treatment, Leave) - - self.widget.continuous_treatment = BySpan - self.assertRaises(ValueError, w.commit) + self.assertEqual(w.continuous_treatment, w.Normalize.Leave) # remove data self.send_signal(w.Inputs.data, None) assert_enabled(True) # input sparse + buttons[w.Normalize.Normalize11].click() self.send_signal(w.Inputs.data, sparse_iris) + self.assertEqual(w.continuous_treatment, w.Normalize.Leave) assert_enabled(False) # input dense self.send_signal(w.Inputs.data, iris) assert_enabled(True) + def test_migrate_settings_to_v2(self): + Normalize = OWContinuize.Normalize + + widget = self.create_widget( + OWContinuize, + stored_settings=dict(continuous_treatment=0)) + self.assertEqual(widget.continuous_treatment, Normalize.Leave) + + widget = self.create_widget( + OWContinuize, + stored_settings=dict(continuous_treatment=1, zero_based=True)) + self.assertEqual(widget.continuous_treatment, Normalize.Normalize01) + + widget = self.create_widget( + OWContinuize, + stored_settings=dict(continuous_treatment=1, zero_based=False)) + self.assertEqual(widget.continuous_treatment, Normalize.Normalize11) + + widget = self.create_widget( + OWContinuize, + stored_settings=dict(continuous_treatment=2)) + self.assertEqual(widget.continuous_treatment, Normalize.Standardize) + + def test_normalizations(self): + buttons = self.widget.controls.continuous_treatment.buttons + Normalize = self.widget.Normalize + + domain = Domain([ContinuousVariable(name) for name in "xyz"]) + col0 = np.arange(0, 10, 2).reshape(5, 1) + col1 = np.ones((5, 1)) + col2 = np.arange(-2, 3).reshape(5, 1) + means = np.array([4, 1, 0]) + sds = np.sqrt(np.array([16 + 4 + 0 + 4 + 16, 5, 4 + 1 + 0 + 1 + 4]) / 5) + + x = np.hstack((col0, col1, col2)) + data = Table.from_numpy(domain, x) + self.send_signal(OWContinuize.Inputs.data, data) + + buttons[Normalize.Leave].click() + out = self.get_output(self.widget.Outputs.data) + np.testing.assert_equal(out.X, x) + + buttons[Normalize.Standardize].click() + out = self.get_output(self.widget.Outputs.data) + np.testing.assert_almost_equal(out.X, (x - means) / sds) + + buttons[Normalize.Center].click() + out = self.get_output(self.widget.Outputs.data) + np.testing.assert_almost_equal(out.X, x - means) + + buttons[Normalize.Scale].click() + out = self.get_output(self.widget.Outputs.data) + np.testing.assert_almost_equal(out.X, x / sds) + + buttons[Normalize.Normalize01].click() + out = self.get_output(self.widget.Outputs.data) + col = (np.arange(5) / 4).reshape(5, 1) + np.testing.assert_almost_equal( + out.X, + np.hstack((col, np.zeros((5, 1)), col)) + ) + + buttons[Normalize.Normalize11].click() + out = self.get_output(self.widget.Outputs.data) + col = (np.arange(5) / 2).reshape(5, 1) - 1 + np.testing.assert_almost_equal( + out.X, + np.hstack((col, np.zeros((5, 1)), col)) + ) + + def test_send_report(self): + self.widget.send_report() + class TestOWContinuizeUtils(unittest.TestCase): def test_dummy_coding_zero_based(self): @@ -166,18 +238,6 @@ def test_dummy_coding_zero_based(self): self.assertEqual(varc.compute_value.value, 2) self.assertIs(varc.compute_value.variable, var) - varb, varc = owcontinuize.dummy_coding(var, zero_based=False) - - self.assertEqual(varb.name, "foo=b") - self.assertIsInstance(varb.compute_value, transformation.Indicator1) - self.assertEqual(varb.compute_value.value, 1) - self.assertIs(varb.compute_value.variable, var) - - self.assertEqual(varc.name, "foo=c") - self.assertIsInstance(varc.compute_value, transformation.Indicator1) - self.assertEqual(varc.compute_value.value, 2) - self.assertIs(varb.compute_value.variable, var) - def test_dummy_coding_base_value(self): var = DiscreteVariable("foo", values=list("abc")) @@ -202,20 +262,13 @@ def test_dummy_coding_base_value(self): def test_one_hot_coding(self): var = DiscreteVariable("foo", values=list("abc")) - vars = owcontinuize.one_hot_coding(var) - for i, (c, nvar) in enumerate(zip("abc", vars)): + new_vars = owcontinuize.one_hot_coding(var) + for i, (c, nvar) in enumerate(zip("abc", new_vars)): self.assertEqual(nvar.name, f"foo={c}") self.assertIsInstance(nvar.compute_value, transformation.Indicator) self.assertEqual(nvar.compute_value.value, i) self.assertIs(nvar.compute_value.variable, var) - vars = owcontinuize.one_hot_coding(var, zero_based=False) - for i, (c, nvar) in enumerate(zip("abc", vars)): - self.assertEqual(nvar.name, f"foo={c}") - self.assertIsInstance(nvar.compute_value, transformation.Indicator1) - self.assertEqual(nvar.compute_value.value, i) - self.assertIs(nvar.compute_value.variable, var) - if __name__ == "__main__": unittest.main()