Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] PCA: Remove SVD & add normalization for sparse #3581

Merged
merged 2 commits into from
Feb 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 18 additions & 49 deletions Orange/widgets/unsupervised/owpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@

from Orange.data import Table, Domain, StringVariable, ContinuousVariable
from Orange.data.sql.table import SqlTable, AUTO_DL_LIMIT
from Orange.preprocess import Normalize
from Orange.preprocess.preprocess import Preprocess, ApplyDomain
from Orange.projection import PCA, TruncatedSVD
from Orange.preprocess import preprocess
from Orange.projection import PCA
from Orange.widgets import widget, gui, settings
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import Input, Output
Expand All @@ -26,11 +25,6 @@
# Maximum number of PCA components that we can set in the widget
MAX_COMPONENTS = 100

DECOMPOSITIONS = [
PCA,
TruncatedSVD
]


class OWPCA(widget.OWWidget):
name = "PCA"
Expand All @@ -46,7 +40,7 @@ class Outputs:
transformed_data = Output("Transformed data", Table)
components = Output("Components", Table)
pca = Output("PCA", PCA, dynamic=False)
preprocessor = Output("Preprocessor", Preprocess)
preprocessor = Output("Preprocessor", preprocess.Preprocess)

settingsHandler = settings.DomainContextHandler()

Expand All @@ -57,7 +51,6 @@ class Outputs:
auto_update = settings.Setting(True)
auto_commit = settings.Setting(True)
normalize = settings.ContextSetting(True)
decomposition_idx = settings.ContextSetting(0)
maxp = settings.Setting(20)
axis_labels = settings.Setting(10)

Expand All @@ -71,7 +64,6 @@ class Warning(widget.OWWidget.Warning):
class Error(widget.OWWidget.Error):
no_features = widget.Msg("At least 1 feature is required")
no_instances = widget.Msg("At least 1 data instance is required")
sparse_data = widget.Msg("Sparse data is not supported")

def __init__(self):
super().__init__()
Expand Down Expand Up @@ -134,13 +126,6 @@ def __init__(self):

self.sampling_box.setVisible(remotely)

# Decomposition
self.decomposition_box = gui.radioButtons(
self.controlArea, self,
"decomposition_idx", [d.name for d in DECOMPOSITIONS],
box="Decomposition", callback=self._update_decomposition
)

# Options
self.options_box = gui.vBox(self.controlArea, "Options")
self.normalize_box = gui.checkBox(
Expand Down Expand Up @@ -183,23 +168,6 @@ def update_model(self):
else:
self.__timer.stop()

def update_buttons(self, sparse_data=False):
if sparse_data:
self.normalize = False

buttons = self.decomposition_box.buttons
for cls, button in zip(DECOMPOSITIONS, buttons):
button.setDisabled(sparse_data and not cls.supports_sparse)

if not buttons[self.decomposition_idx].isEnabled():
# Set decomposition index to first sparse-enabled decomposition
for i, cls in enumerate(DECOMPOSITIONS):
if cls.supports_sparse:
self.decomposition_idx = i
break

self._init_projector()

def start(self):
if 'Abort' in self.start_button.text():
self.rpca.abort()
Expand Down Expand Up @@ -248,9 +216,7 @@ def set_data(self, data):
return

self.openContext(data)
sparse_data = data is not None and data.is_sparse()
self.normalize_box.setDisabled(sparse_data)
self.update_buttons(sparse_data=sparse_data)
self._init_projector()

self.data = data
self.fit()
Expand All @@ -260,9 +226,15 @@ def fit(self):
self.Warning.trivial_components.clear()
if self.data is None:
return

data = self.data
self._pca_projector.preprocessors = \
self._pca_preprocessors + ([Normalize()] if self.normalize else [])

if self.normalize:
self._pca_projector.preprocessors = \
self._pca_preprocessors + [preprocess.Normalize(center=False)]
else:
self._pca_projector.preprocessors = self._pca_preprocessors

if not isinstance(data, SqlTable):
pca = self._pca_projector(data)
variance_ratio = pca.explained_variance_ratio_
Expand Down Expand Up @@ -419,14 +391,9 @@ def _update_normalize(self):
self._invalidate_selection()

def _init_projector(self):
cls = DECOMPOSITIONS[self.decomposition_idx]
self._pca_projector = cls(n_components=MAX_COMPONENTS)
self._pca_projector = PCA(n_components=MAX_COMPONENTS)
self._pca_projector.component = self.ncomponents
self._pca_preprocessors = cls.preprocessors

def _update_decomposition(self):
self._init_projector()
self._update_normalize()
self._pca_preprocessors = PCA.preprocessors

def _nselected_components(self):
"""Return the number of selected components."""
Expand Down Expand Up @@ -483,7 +450,7 @@ def commit(self):
metas=metas)
components.name = 'components'

pp = ApplyDomain(domain, "PCA")
pp = preprocess.ApplyDomain(domain, "PCA")

self._pca_projector.component = self.ncomponents
self.Outputs.transformed_data.send(transformed)
Expand All @@ -495,7 +462,6 @@ def send_report(self):
if self.data is None:
return
self.report_items((
("Decomposition", DECOMPOSITIONS[self.decomposition_idx].name),
("Normalize data", str(self.normalize)),
("Selected components", self.ncomponents),
("Explained variance", "{:.3f} %".format(self.variance_covered))
Expand All @@ -517,6 +483,9 @@ def migrate_settings(cls, settings, version):
if settings.get("ncomponents", 0) > MAX_COMPONENTS:
settings["ncomponents"] = MAX_COMPONENTS

# Remove the obsolete `decomposition_idx` setting, left over from when SVD was still included
settings.pop("decomposition_idx", None)


if __name__ == "__main__": # pragma: no cover
WidgetPreview(OWPCA).run(Table("housing"))
140 changes: 103 additions & 37 deletions Orange/widgets/unsupervised/tests/test_owpca.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,31 @@
# Test methods with long descriptive names can omit docstrings
# pylint: disable=missing-docstring
from unittest.mock import patch

import numpy as np
import scipy.sparse as sp

from Orange.data import Table, Domain, ContinuousVariable, TimeVariable
from Orange.preprocess.preprocess import Preprocess
from Orange.preprocess import preprocess
from Orange.preprocess.preprocess import Preprocess, Normalize
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.unsupervised.owpca import OWPCA, DECOMPOSITIONS
from Orange.widgets.tests.utils import table_dense_sparse
from Orange.widgets.unsupervised.owpca import OWPCA
from sklearn.utils import check_random_state
from sklearn.utils.extmath import svd_flip


class TestOWPCA(WidgetTest):
def setUp(self):
self.widget = self.create_widget(OWPCA) # type: OWPCA
self.iris = Table("iris") # type: Table

def test_set_variance100(self):
iris = Table("iris")[:5]
self.widget.set_data(iris)
self.widget.set_data(self.iris)
self.widget.variance_covered = 100
self.widget._update_selection_variance_spin()

def test_constant_data(self):
data = Table("iris")[::5]
data = self.iris[::5]
data.X[:, :] = 1.0
# Ignore the warning: the test checks whether the widget shows
# Warning.trivial_components when this happens
Expand All @@ -32,12 +37,11 @@ def test_constant_data(self):

def test_empty_data(self):
""" Check widget for dataset with no rows and for dataset with no attributes """
data = Table("iris")
self.send_signal(self.widget.Inputs.data, data[:0])
self.send_signal(self.widget.Inputs.data, self.iris[:0])
self.assertTrue(self.widget.Error.no_instances.is_shown())

domain = Domain([], None, data.domain.variables)
new_data = Table.from_table(domain, data)
domain = Domain([], None, self.iris.domain.variables)
new_data = Table.from_table(domain, self.iris)
self.send_signal(self.widget.Inputs.data, new_data)
self.assertTrue(self.widget.Error.no_features.is_shown())
self.assertFalse(self.widget.Error.no_instances.is_shown())
Expand Down Expand Up @@ -74,8 +78,7 @@ def test_migrate_settings_changes_variance_covered_to_int(self):
self.assertEqual(settings["variance_covered"], 100)

def test_variance_shown(self):
data = Table("iris")
self.send_signal(self.widget.Inputs.data, data)
self.send_signal(self.widget.Inputs.data, self.iris)
self.widget.maxp = 2
self.widget._setup_plot()
var2 = self.widget.variance_covered
Expand All @@ -85,22 +88,27 @@ def test_variance_shown(self):
self.assertGreater(var3, var2)

def test_sparse_data(self):
data = Table("iris")
data.X = sp.csr_matrix(data.X)
self.widget.set_data(data)
decomposition = DECOMPOSITIONS[self.widget.decomposition_idx]
self.assertTrue(decomposition.supports_sparse)
self.assertFalse(self.widget.normalize_box.isEnabled())
"""Check that PCA returns the same results for both dense and sparse data."""
dense_data, sparse_data = self.iris, self.iris.to_sparse()

buttons = self.widget.decomposition_box.group.box.buttons
for i, decomposition in enumerate(DECOMPOSITIONS):
if not decomposition.supports_sparse:
self.assertFalse(buttons[i].isEnabled())

data = Table("iris")
self.widget.set_data(data)
self.assertTrue(all([b.isEnabled() for b in buttons]))
self.assertTrue(self.widget.normalize_box.isEnabled())
def _compute_projection(data):
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_stop_blocking()
result = self.get_output(self.widget.Outputs.transformed_data)
self.send_signal(self.widget.Inputs.data, None)
return result

# Disable normalization
self.widget.controls.normalize.setChecked(False)
dense_pca = _compute_projection(dense_data)
sparse_pca = _compute_projection(sparse_data)
np.testing.assert_almost_equal(dense_pca.X, sparse_pca.X)

# Enable normalization
self.widget.controls.normalize.setChecked(True)
dense_pca = _compute_projection(dense_data)
sparse_pca = _compute_projection(sparse_data)
np.testing.assert_almost_equal(dense_pca.X, sparse_pca.X)

def test_all_components_continuous(self):
data = Table("banking-crises.tab")
Expand All @@ -117,16 +125,75 @@ def test_all_components_continuous(self):
for a in components.domain.attributes),
"Some variables aren't of type ContinuousVariable")

def test_normalization(self):
data = Table("iris.tab")
@table_dense_sparse
def test_normalize_data(self, prepare_table):
"""Check that `Normalize` is constructed only when the checkbox is ticked.

The same scenario is run twice on the table returned by `prepare_table`:
once with the "normalize" checkbox checked (exactly one `Normalize` call
expected) and once unchecked (no call expected).
"""
# NOTE(review): `prepare_table` is supplied by the `table_dense_sparse`
# decorator; presumably it converts the table to a dense or a sparse
# variant -- confirm in Orange.widgets.tests.utils.
data = prepare_table(self.iris)

# Enable checkbox
self.widget.controls.normalize.setChecked(True)
self.assertTrue(self.widget.controls.normalize.isChecked())
# Spy on `preprocess.Normalize`: `wraps=Normalize` forwards every call to the
# real class while the mock records it. The attribute is replaced on the
# `preprocess` module object, so only code that looks the class up as
# `preprocess.Normalize` at call time is intercepted.
with patch.object(preprocess, "Normalize", wraps=Normalize) as normalize:
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_stop_blocking()
# The checkbox itself must stay enabled for this input.
self.assertTrue(self.widget.controls.normalize.isEnabled())
normalize.assert_called_once()

# Disable checkbox
self.widget.controls.normalize.setChecked(False)
self.assertFalse(self.widget.controls.normalize.isChecked())
# A fresh spy is installed so calls from the first half are not counted.
with patch.object(preprocess, "Normalize", wraps=Normalize) as normalize:
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_stop_blocking()
# Still enabled: unchecking the box must not disable the control.
self.assertTrue(self.widget.controls.normalize.isEnabled())
normalize.assert_not_called()

@table_dense_sparse
def test_normalization_variance(self, prepare_table):
data = prepare_table(self.iris)
self.widget.ncomponents = 2

# Enable normalization
self.widget.controls.normalize.setChecked(True)
self.assertTrue(self.widget.normalize)
self.widget.set_data(data)
varnorm = self.widget.variance_covered
self.widget.controls.normalize.toggle()
varnonnorm = self.widget.variance_covered
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_stop_blocking()
variance_normalized = self.widget.variance_covered

# Disable normalization
self.widget.controls.normalize.setChecked(False)
self.assertFalse(self.widget.normalize)
self.wait_until_stop_blocking()
variance_unnormalized = self.widget.variance_covered

# normalized data will have lower covered variance
self.assertLess(varnorm, varnonnorm)
self.assertLess(variance_normalized, variance_unnormalized)

@table_dense_sparse
def test_normalized_gives_correct_result(self, prepare_table):
"""Make sure that normalization through widget gives correct result.

The widget output (normalization enabled) is compared against a reference
embedding computed here with NumPy: column-wise standardization followed
by an SVD truncated to the first two components.
"""
# Randomly set some values to zero
# Fixed seed keeps the mask reproducible; entries whose Beta(1, 2) draw
# exceeds 0.5 (about a quarter of them) are zeroed.
random_state = check_random_state(42)
mask = random_state.beta(1, 2, size=self.iris.X.shape) > 0.5
# Modifies `self.iris.X` in place; `self.iris` is reassigned in `setUp`
# before every test.
self.iris.X[mask] = 0

# `prepare_table` comes from the `table_dense_sparse` decorator; it is
# applied after the zeroing so the widget sees the modified values.
data = prepare_table(self.iris)

# Enable normalization and run data through widget
self.widget.controls.normalize.setChecked(True)
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_stop_blocking()
widget_result = self.get_output(self.widget.Outputs.transformed_data)

# Compute the correct embedding
x = self.iris.X
# Standardize each column (axis 0): subtract its mean, divide by its
# standard deviation.
x = (x - x.mean(0)) / x.std(0)
U, S, Va = np.linalg.svd(x)
# Keep only the first two components.
# NOTE(review): assumes the widget outputs two components here -- confirm
# against the widget's `ncomponents` default.
U, S, Va = U[:, :2], S[:2], Va[:2]
# Singular vectors are only defined up to sign; `svd_flip` applies a fixed
# sign convention so the reference is comparable to the widget output.
U, Va = svd_flip(U, Va)
# Left singular vectors scaled by the singular values, i.e. the
# standardized data projected onto the two retained axes.
pca_embedding = U * S

np.testing.assert_almost_equal(widget_result.X, pca_embedding)

def test_do_not_mask_features(self):
# the widget used to replace cached variables when creating the
Expand All @@ -137,11 +204,10 @@ def test_do_not_mask_features(self):
self.assertEqual(data.domain[0], ndata.domain[0])

def test_output_preprocessor(self):
data = Table("iris")
self.send_signal(self.widget.Inputs.data, data)
self.send_signal(self.widget.Inputs.data, self.iris)
pp = self.get_output(self.widget.Outputs.preprocessor)
self.assertIsInstance(pp, Preprocess)
transformed_data = pp(data[::10])
transformed_data = pp(self.iris[::10])
self.assertIsInstance(transformed_data, Table)
self.assertEqual(transformed_data.X.shape, (15, 2))
output = self.get_output(self.widget.Outputs.transformed_data)
Expand Down