Skip to content

Commit

Permalink
OwPCA: Remove SVD & add normalization support
Browse files Browse the repository at this point in the history
  • Loading branch information
pavlin-policar committed Feb 15, 2019
1 parent fe28eaf commit c86b4cb
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 82 deletions.
63 changes: 18 additions & 45 deletions Orange/widgets/unsupervised/owpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@

from Orange.data import Table, Domain, StringVariable, ContinuousVariable
from Orange.data.sql.table import SqlTable, AUTO_DL_LIMIT
from Orange.preprocess import Normalize
from Orange.preprocess.preprocess import Preprocess, ApplyDomain
from Orange.projection import PCA, TruncatedSVD
from Orange.preprocess import preprocess
from Orange.projection import PCA
from Orange.widgets import widget, gui, settings
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import Input, Output
Expand All @@ -26,11 +25,6 @@
# Maximum number of PCA components that we can set in the widget
MAX_COMPONENTS = 100

DECOMPOSITIONS = [
PCA,
TruncatedSVD
]


class OWPCA(widget.OWWidget):
name = "PCA"
Expand All @@ -46,7 +40,7 @@ class Outputs:
transformed_data = Output("Transformed data", Table)
components = Output("Components", Table)
pca = Output("PCA", PCA, dynamic=False)
preprocessor = Output("Preprocessor", Preprocess)
preprocessor = Output("Preprocessor", preprocess.Preprocess)

settingsHandler = settings.DomainContextHandler()

Expand All @@ -57,7 +51,6 @@ class Outputs:
auto_update = settings.Setting(True)
auto_commit = settings.Setting(True)
normalize = settings.ContextSetting(True)
decomposition_idx = settings.ContextSetting(0)
maxp = settings.Setting(20)
axis_labels = settings.Setting(10)

Expand All @@ -71,7 +64,6 @@ class Warning(widget.OWWidget.Warning):
class Error(widget.OWWidget.Error):
no_features = widget.Msg("At least 1 feature is required")
no_instances = widget.Msg("At least 1 data instance is required")
sparse_data = widget.Msg("Sparse data is not supported")

def __init__(self):
super().__init__()
Expand Down Expand Up @@ -134,13 +126,6 @@ def __init__(self):

self.sampling_box.setVisible(remotely)

# Decomposition
self.decomposition_box = gui.radioButtons(
self.controlArea, self,
"decomposition_idx", [d.name for d in DECOMPOSITIONS],
box="Decomposition", callback=self._update_decomposition
)

# Options
self.options_box = gui.vBox(self.controlArea, "Options")
self.normalize_box = gui.checkBox(
Expand Down Expand Up @@ -183,23 +168,6 @@ def update_model(self):
else:
self.__timer.stop()

def update_buttons(self, sparse_data=False):
if sparse_data:
self.normalize = False

buttons = self.decomposition_box.buttons
for cls, button in zip(DECOMPOSITIONS, buttons):
button.setDisabled(sparse_data and not cls.supports_sparse)

if not buttons[self.decomposition_idx].isEnabled():
# Set decomposition index to first sparse-enabled decomposition
for i, cls in enumerate(DECOMPOSITIONS):
if cls.supports_sparse:
self.decomposition_idx = i
break

self._init_projector()

def start(self):
if 'Abort' in self.start_button.text():
self.rpca.abort()
Expand Down Expand Up @@ -248,9 +216,7 @@ def set_data(self, data):
return

self.openContext(data)
sparse_data = data is not None and data.is_sparse()
self.normalize_box.setDisabled(sparse_data)
self.update_buttons(sparse_data=sparse_data)
self._init_projector()

self.data = data
self.fit()
Expand All @@ -260,9 +226,15 @@ def fit(self):
self.Warning.trivial_components.clear()
if self.data is None:
return

data = self.data
self._pca_projector.preprocessors = \
self._pca_preprocessors + ([Normalize()] if self.normalize else [])

if self.normalize:
self._pca_projector.preprocessors = \
self._pca_preprocessors + [preprocess.Normalize(center=False)]
else:
self._pca_projector.preprocessors = self._pca_preprocessors

if not isinstance(data, SqlTable):
pca = self._pca_projector(data)
variance_ratio = pca.explained_variance_ratio_
Expand Down Expand Up @@ -419,10 +391,9 @@ def _update_normalize(self):
self._invalidate_selection()

def _init_projector(self):
cls = DECOMPOSITIONS[self.decomposition_idx]
self._pca_projector = cls(n_components=MAX_COMPONENTS)
self._pca_projector = PCA(n_components=MAX_COMPONENTS)
self._pca_projector.component = self.ncomponents
self._pca_preprocessors = cls.preprocessors
self._pca_preprocessors = PCA.preprocessors

def _update_decomposition(self):
self._init_projector()
Expand Down Expand Up @@ -483,7 +454,7 @@ def commit(self):
metas=metas)
components.name = 'components'

pp = ApplyDomain(domain, "PCA")
pp = preprocess.ApplyDomain(domain, "PCA")

self._pca_projector.component = self.ncomponents
self.Outputs.transformed_data.send(transformed)
Expand All @@ -495,7 +466,6 @@ def send_report(self):
if self.data is None:
return
self.report_items((
("Decomposition", DECOMPOSITIONS[self.decomposition_idx].name),
("Normalize data", str(self.normalize)),
("Selected components", self.ncomponents),
("Explained variance", "{:.3f} %".format(self.variance_covered))
Expand All @@ -517,6 +487,9 @@ def migrate_settings(cls, settings, version):
if settings.get("ncomponents", 0) > MAX_COMPONENTS:
settings["ncomponents"] = MAX_COMPONENTS

# Drop the obsolete `decomposition_idx` setting, left over from when SVD was still included
settings.pop("decomposition_idx", None)


if __name__ == "__main__": # pragma: no cover
WidgetPreview(OWPCA).run(Table("housing"))
140 changes: 103 additions & 37 deletions Orange/widgets/unsupervised/tests/test_owpca.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,31 @@
# Test methods with long descriptive names can omit docstrings
# pylint: disable=missing-docstring
from unittest.mock import patch

import numpy as np
import scipy.sparse as sp

from Orange.data import Table, Domain, ContinuousVariable, TimeVariable
from Orange.preprocess.preprocess import Preprocess
from Orange.preprocess import preprocess
from Orange.preprocess.preprocess import Preprocess, Normalize
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.unsupervised.owpca import OWPCA, DECOMPOSITIONS
from Orange.widgets.tests.utils import table_dense_sparse
from Orange.widgets.unsupervised.owpca import OWPCA
from sklearn.utils import check_random_state
from sklearn.utils.extmath import svd_flip


class TestOWPCA(WidgetTest):
def setUp(self):
self.widget = self.create_widget(OWPCA) # type: OWPCA
self.iris = Table("iris") # type: Table

def test_set_variance100(self):
iris = Table("iris")[:5]
self.widget.set_data(iris)
self.widget.set_data(self.iris)
self.widget.variance_covered = 100
self.widget._update_selection_variance_spin()

def test_constant_data(self):
data = Table("iris")[::5]
data = self.iris[::5]
data.X[:, :] = 1.0
# Ignore the warning: the test checks whether the widget shows
# Warning.trivial_components when this happens
Expand All @@ -32,12 +37,11 @@ def test_constant_data(self):

def test_empty_data(self):
""" Check widget for dataset with no rows and for dataset with no attributes """
data = Table("iris")
self.send_signal(self.widget.Inputs.data, data[:0])
self.send_signal(self.widget.Inputs.data, self.iris[:0])
self.assertTrue(self.widget.Error.no_instances.is_shown())

domain = Domain([], None, data.domain.variables)
new_data = Table.from_table(domain, data)
domain = Domain([], None, self.iris.domain.variables)
new_data = Table.from_table(domain, self.iris)
self.send_signal(self.widget.Inputs.data, new_data)
self.assertTrue(self.widget.Error.no_features.is_shown())
self.assertFalse(self.widget.Error.no_instances.is_shown())
Expand Down Expand Up @@ -74,8 +78,7 @@ def test_migrate_settings_changes_variance_covered_to_int(self):
self.assertEqual(settings["variance_covered"], 100)

def test_variance_shown(self):
data = Table("iris")
self.send_signal(self.widget.Inputs.data, data)
self.send_signal(self.widget.Inputs.data, self.iris)
self.widget.maxp = 2
self.widget._setup_plot()
var2 = self.widget.variance_covered
Expand All @@ -85,22 +88,27 @@ def test_variance_shown(self):
self.assertGreater(var3, var2)

def test_sparse_data(self):
data = Table("iris")
data.X = sp.csr_matrix(data.X)
self.widget.set_data(data)
decomposition = DECOMPOSITIONS[self.widget.decomposition_idx]
self.assertTrue(decomposition.supports_sparse)
self.assertFalse(self.widget.normalize_box.isEnabled())
"""Check that PCA returns the same results for both dense and sparse data."""
dense_data, sparse_data = self.iris, self.iris.to_sparse()

buttons = self.widget.decomposition_box.group.box.buttons
for i, decomposition in enumerate(DECOMPOSITIONS):
if not decomposition.supports_sparse:
self.assertFalse(buttons[i].isEnabled())

data = Table("iris")
self.widget.set_data(data)
self.assertTrue(all([b.isEnabled() for b in buttons]))
self.assertTrue(self.widget.normalize_box.isEnabled())
def _compute_projection(data):
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_stop_blocking()
result = self.get_output(self.widget.Outputs.transformed_data)
self.send_signal(self.widget.Inputs.data, None)
return result

# Disable normalization
self.widget.controls.normalize.setChecked(False)
dense_pca = _compute_projection(dense_data)
sparse_pca = _compute_projection(sparse_data)
np.testing.assert_almost_equal(dense_pca.X, sparse_pca.X)

# Enable normalization
self.widget.controls.normalize.setChecked(True)
dense_pca = _compute_projection(dense_data)
sparse_pca = _compute_projection(sparse_data)
np.testing.assert_almost_equal(dense_pca.X, sparse_pca.X)

def test_all_components_continuous(self):
data = Table("banking-crises.tab")
Expand All @@ -117,16 +125,75 @@ def test_all_components_continuous(self):
for a in components.domain.attributes),
"Some variables aren't of type ContinuousVariable")

def test_normalization(self):
data = Table("iris.tab")
@table_dense_sparse
def test_normalize_data(self, prepare_table):
"""Check that `preprocess.Normalize` is constructed only when the
normalize checkbox is checked.

`prepare_table` is supplied by the `table_dense_sparse` decorator
(presumably it converts the table to a dense or sparse variant --
TODO confirm against Orange.widgets.tests.utils).
"""
data = prepare_table(self.iris)

# Enable checkbox
self.widget.controls.normalize.setChecked(True)
self.assertTrue(self.widget.controls.normalize.isChecked())
# Spy on `preprocess.Normalize`: the mock wraps the real class, so calls
# still reach it while being recorded for the assertion below.
with patch.object(preprocess, "Normalize", wraps=Normalize) as normalize:
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_stop_blocking()
# The checkbox must remain enabled for this input
self.assertTrue(self.widget.controls.normalize.isEnabled())
# With the checkbox checked, exactly one Normalize is expected
normalize.assert_called_once()

# Disable checkbox
self.widget.controls.normalize.setChecked(False)
self.assertFalse(self.widget.controls.normalize.isChecked())
# Same spy as above, this time expecting no Normalize to be created
with patch.object(preprocess, "Normalize", wraps=Normalize) as normalize:
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_stop_blocking()
# The checkbox must remain enabled even when unchecked
self.assertTrue(self.widget.controls.normalize.isEnabled())
normalize.assert_not_called()

@table_dense_sparse
def test_normalization_variance(self, prepare_table):
data = prepare_table(self.iris)
self.widget.ncomponents = 2

# Enable normalization
self.widget.controls.normalize.setChecked(True)
self.assertTrue(self.widget.normalize)
self.widget.set_data(data)
varnorm = self.widget.variance_covered
self.widget.controls.normalize.toggle()
varnonnorm = self.widget.variance_covered
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_stop_blocking()
variance_normalized = self.widget.variance_covered

# Disable normalization
self.widget.controls.normalize.setChecked(False)
self.assertFalse(self.widget.normalize)
self.wait_until_stop_blocking()
variance_unnormalized = self.widget.variance_covered

# normalized data will have lower covered variance
self.assertLess(varnorm, varnonnorm)
self.assertLess(variance_normalized, variance_unnormalized)

@table_dense_sparse
def test_normalized_gives_correct_result(self, prepare_table):
    """Compare the widget's normalized projection with a hand-computed one.

    The reference embedding is built by standardizing each column, taking
    the SVD and keeping the first two components.
    """
    # Zero out a random subset of the cells
    rng = check_random_state(42)
    zero_mask = rng.beta(1, 2, size=self.iris.X.shape) > 0.5
    self.iris.X[zero_mask] = 0

    table = prepare_table(self.iris)

    # Push the table through the widget with normalization switched on
    self.widget.controls.normalize.setChecked(True)
    self.send_signal(self.widget.Inputs.data, table)
    self.wait_until_stop_blocking()
    projected = self.get_output(self.widget.Outputs.transformed_data)

    # Reference embedding: standardize, decompose, keep two components
    raw = self.iris.X
    standardized = (raw - raw.mean(0)) / raw.std(0)
    left, singular, right = np.linalg.svd(standardized)
    left, _ = svd_flip(left[:, :2], right[:2])
    expected = left * singular[:2]

    np.testing.assert_almost_equal(projected.X, expected)

def test_do_not_mask_features(self):
# the widget used to replace cached variables when creating the
Expand All @@ -137,11 +204,10 @@ def test_do_not_mask_features(self):
self.assertEqual(data.domain[0], ndata.domain[0])

def test_output_preprocessor(self):
data = Table("iris")
self.send_signal(self.widget.Inputs.data, data)
self.send_signal(self.widget.Inputs.data, self.iris)
pp = self.get_output(self.widget.Outputs.preprocessor)
self.assertIsInstance(pp, Preprocess)
transformed_data = pp(data[::10])
transformed_data = pp(self.iris[::10])
self.assertIsInstance(transformed_data, Table)
self.assertEqual(transformed_data.X.shape, (15, 2))
output = self.get_output(self.widget.Outputs.transformed_data)
Expand Down

0 comments on commit c86b4cb

Please sign in to comment.