diff --git a/Orange/classification/outlier_detection.py b/Orange/classification/outlier_detection.py index 6892a04d112..2ddb9366b9f 100644 --- a/Orange/classification/outlier_detection.py +++ b/Orange/classification/outlier_detection.py @@ -1,21 +1,111 @@ # pylint: disable=unused-argument +import numpy as np + +from Orange.data.table import DomainTransformationError +from Orange.data.util import get_unique_names from sklearn.covariance import EllipticEnvelope from sklearn.ensemble import IsolationForest from sklearn.neighbors import LocalOutlierFactor +from sklearn.svm import OneClassSVM + from Orange.base import SklLearner, SklModel -from Orange.data import Table, Domain +from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable, \ + Variable +from Orange.preprocess import AdaptiveNormalize +from Orange.statistics.util import all_nan __all__ = ["LocalOutlierFactorLearner", "IsolationForestLearner", - "EllipticEnvelopeLearner"] - - -class _OutlierDetector(SklLearner): - def __call__(self, data: Table): - data = data.transform(Domain(data.domain.attributes)) - return super().__call__(data) + "EllipticEnvelopeLearner", "OneClassSVMLearner"] + + +class _OutlierModel(SklModel): + def __init__(self, skl_model): + super().__init__(skl_model) + self._cached_data = None + self.outlier_var = None + + def predict(self, X: np.ndarray) -> np.ndarray: + pred = self.skl_model.predict(X) + pred[pred == -1] = 0 + return pred[:, None] + + def __call__(self, data: Table) -> Table: + assert isinstance(data, Table) + assert self.outlier_var is not None + + domain = Domain(data.domain.attributes, data.domain.class_vars, + data.domain.metas + (self.outlier_var,)) + self._cached_data = self.data_to_model_domain(data) + metas = np.hstack((data.metas, self.predict(self._cached_data.X))) + return Table.from_numpy(domain, data.X, data.Y, metas) + + def data_to_model_domain(self, data: Table) -> Table: + if data.domain == self.domain: + return data + + if self.original_domain.attributes != data.domain.attributes \ + and data.X.size \ + and not all_nan(data.X): + new_data = data.transform(self.original_domain) + if all_nan(new_data.X): + raise DomainTransformationError( + "domain transformation produced no defined values") + return new_data.transform(self.domain) + return data.transform(self.domain) + + +class _OutlierLearner(SklLearner): + __returns__ = _OutlierModel + supports_multiclass = True + + def _fit_model(self, data: Table) -> _OutlierModel: + domain = data.domain + model = super()._fit_model(data.transform(Domain(domain.attributes))) + + transformer = _Transformer(model) + names = [v.name for v in domain.variables + domain.metas] + variable = DiscreteVariable( + get_unique_names(names, "Outlier"), + values=["Yes", "No"], + compute_value=transformer + ) + + transformer.variable = variable + model.outlier_var = variable + return model + + +class _Transformer: + def __init__(self, model: _OutlierModel): + self._model = model + self._variable = None + + @property + def variable(self) -> Variable: + return self._variable + + @variable.setter + def variable(self, var: Variable): + self._variable = var + + def __call__(self, data: Table) -> np.ndarray: + assert isinstance(self._variable, Variable) + return self._model(data).get_column_view(self._variable)[0] + + +class OneClassSVMLearner(_OutlierLearner): + name = "One class SVM" + __wraps__ = OneClassSVM + preprocessors = SklLearner.preprocessors + [AdaptiveNormalize()] + + def __init__(self, kernel='rbf', degree=3, gamma="auto", coef0=0.0, + tol=0.001, nu=0.5, shrinking=True, cache_size=200, + max_iter=-1, preprocessors=None): + super().__init__(preprocessors=preprocessors) + self.params = vars() -class LocalOutlierFactorLearner(_OutlierDetector): +class LocalOutlierFactorLearner(_OutlierLearner): __wraps__ = LocalOutlierFactor name = "Local Outlier Factor" @@ -27,7 +117,7 @@ def __init__(self, n_neighbors=20, algorithm="auto", leaf_size=30, self.params = vars() -class IsolationForestLearner(_OutlierDetector): +class IsolationForestLearner(_OutlierLearner): __wraps__ = IsolationForest name = "Isolation Forest" @@ -39,25 +129,34 @@ def __init__(self, n_estimators=100, max_samples='auto', self.params = vars() -class EllipticEnvelopeClassifier(SklModel): - def mahalanobis(self, observations): +class EllipticEnvelopeClassifier(_OutlierModel): + def __init__(self, skl_model): + super().__init__(skl_model) + self.mahal_var = None + + def mahalanobis(self, observations: np.ndarray) -> np.ndarray: """Computes squared Mahalanobis distances of given observations. Parameters ---------- - observations : ndarray (n_samples, n_features) or Orange Table + observations : ndarray (n_samples, n_features) Returns ------- - distances : ndarray (n_samples,) + distances : ndarray (n_samples, 1) Squared Mahalanobis distances given observations. """ - if isinstance(observations, Table): - observations = observations.X - return self.skl_model.mahalanobis(observations) + return self.skl_model.mahalanobis(observations)[:, None] + + def __call__(self, data: Table) -> Table: + pred = super().__call__(data) + domain = Domain(pred.domain.attributes, pred.domain.class_vars, + pred.domain.metas + (self.mahal_var,)) + metas = np.hstack((pred.metas, self.mahalanobis(self._cached_data.X))) + return Table.from_numpy(domain, pred.X, pred.Y, metas) -class EllipticEnvelopeLearner(_OutlierDetector): +class EllipticEnvelopeLearner(_OutlierLearner): __wraps__ = EllipticEnvelope __returns__ = EllipticEnvelopeClassifier name = "Covariance Estimator" @@ -68,6 +167,18 @@ def __init__(self, store_precision=True, assume_centered=False, super().__init__(preprocessors=preprocessors) self.params = vars() - def __call__(self, data: Table): - data = data.transform(Domain(data.domain.attributes)) - return super().__call__(data) + def _fit_model(self, data: Table) -> EllipticEnvelopeClassifier: + domain = data.domain + model = super()._fit_model(data.transform(Domain(domain.attributes))) + + transformer = _Transformer(model) + names = [v.name for v in domain.variables + domain.metas] + variable = ContinuousVariable( + get_unique_names(names, "Mahalanobis"), + compute_value=transformer + ) + + transformer.variable = variable + model.mahal_var = variable + return model + diff --git a/Orange/classification/svm.py b/Orange/classification/svm.py index a3626f6be54..adbb5080867 100644 --- a/Orange/classification/svm.py +++ b/Orange/classification/svm.py @@ -1,12 +1,9 @@ import sklearn.svm as skl_svm -from Orange.base import SklLearner as SklLearnerBase from Orange.classification import SklLearner, SklModel -from Orange.data import Domain from Orange.preprocess import AdaptiveNormalize -__all__ = ["SVMLearner", "LinearSVMLearner", "NuSVMLearner", - "OneClassSVMLearner"] +__all__ = ["SVMLearner", "LinearSVMLearner", "NuSVMLearner"] svm_pps = SklLearner.preprocessors + [AdaptiveNormalize()] @@ -62,28 +59,6 @@ def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma="auto", coef0=0.0, self.params = vars() -class OneClassSVMLearner(SklLearnerBase): - name = "One class SVM" - __wraps__ = skl_svm.OneClassSVM - preprocessors = svm_pps - - def __init__(self, kernel='rbf', degree=3, gamma="auto", coef0=0.0, - tol=0.001, nu=0.5, shrinking=True, cache_size=200, - max_iter=-1, preprocessors=None): - super().__init__(preprocessors=preprocessors) - self.params = vars() - - def __call__(self, data): - classless_data = data.transform(Domain(data.domain.attributes)) - return super().__call__(classless_data) - - def fit(self, X, Y=None, W=None): - clf = self.__wraps__(**self.params) - if W is not None: - return self.__returns__(clf.fit(X, W.reshape(-1))) - return self.__returns__(clf.fit(X)) - - if __name__ == '__main__': import Orange diff --git a/Orange/classification/tests/test_outlier_detection.py b/Orange/classification/tests/test_outlier_detection.py index 1ac0b0c4ada..272ed93438f 100644 --- a/Orange/classification/tests/test_outlier_detection.py +++ b/Orange/classification/tests/test_outlier_detection.py @@ -1,17 +1,106 @@ # Test methods with long descriptive names can omit docstrings # pylint: disable=missing-docstring - +import pickle +import tempfile import unittest import numpy as np -from Orange.data import Table, Domain, ContinuousVariable + from Orange.classification import EllipticEnvelopeLearner, \ - IsolationForestLearner, LocalOutlierFactorLearner + IsolationForestLearner, LocalOutlierFactorLearner, OneClassSVMLearner +from Orange.data import Table, Domain, ContinuousVariable + + +class _TestDetector(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.iris = Table("iris") + + def assert_domain_equal(self, domain1, domain2): + for var1, var2 in zip(domain1.variables + domain1.metas, + domain2.variables + domain2.metas): + self.assertEqual(type(var1), type(var2)) + self.assertEqual(var1.name, var2.name) + if var1.is_discrete: + self.assertEqual(var1.values, var2.values) + + def assert_table_equal(self, table1, table2): + if table1 is None or table2 is None: + self.assertIs(table1, table2) + return + self.assert_domain_equal(table1.domain, table2.domain) + np.testing.assert_array_equal(table1.X, table2.X) + np.testing.assert_array_equal(table1.Y, table2.Y) + np.testing.assert_array_equal(table1.metas, table2.metas) + + def assert_table_appended_outlier(self, table1, table2, offset=1): + np.testing.assert_array_equal(table1.X, table2.X) + np.testing.assert_array_equal(table1.Y, table2.Y) + np.testing.assert_array_equal(table1.metas, table2.metas[:, :-offset]) + metas = table2.metas[:, -offset] + self.assertEqual(sum(metas == 1) + sum(metas == 0), len(metas)) + dom = table2.domain + domain = Domain(dom.attributes, dom.class_vars, dom.metas[:-offset]) + self.assert_domain_equal(table1.domain, domain) + self.assertEqual(table2.domain.metas[-offset].name, "Outlier") + self.assertIsNotNone(table2.domain.metas[-offset].compute_value) + + +class TestOneClassSVMLearner(_TestDetector): + def test_OneClassSVM(self): + np.random.seed(42) + domain = Domain((ContinuousVariable("c1"), ContinuousVariable("c2"))) + X_in = 0.3 * np.random.randn(40, 2) + X_out = np.random.uniform(low=-4, high=4, size=(20, 2)) + X_all = Table(domain, np.r_[X_in + 2, X_in - 2, X_out]) + n_true_in = len(X_in) * 2 + n_true_out = len(X_out) + + nu = 0.2 + learner = OneClassSVMLearner(nu=nu) + cls = learner(X_all) + y_pred = cls(X_all) + n_pred_out_all = np.sum(y_pred.metas == 0) + n_pred_in_true_in = np.sum(y_pred.metas[:n_true_in] == 1) + n_pred_out_true_out = np.sum(y_pred.metas[- n_true_out:] == 0) + + self.assertLessEqual(n_pred_out_all, len(X_all) * nu) + self.assertLess(np.absolute(n_pred_out_all - n_true_out), 2) + self.assertLess(np.absolute(n_pred_in_true_in - n_true_in), 4) + self.assertLess(np.absolute(n_pred_out_true_out - n_true_out), 3) + + def test_OneClassSVM_ignores_y(self): + domain = Domain((ContinuousVariable("x1"), ContinuousVariable("x2")), + class_vars=(ContinuousVariable("y1"), ContinuousVariable("y2"))) + X = np.random.random((40, 2)) + Y = np.random.random((40, 2)) + table = Table(domain, X, Y) + classless_table = table.transform(Domain(table.domain.attributes)) + learner = OneClassSVMLearner() + classless_model = learner(classless_table) + model = learner(table) + pred1 = classless_model(classless_table) + pred2 = classless_model(table) + pred3 = model(classless_table) + pred4 = model(table) + np.testing.assert_array_equal(pred1.metas, pred2.metas) + np.testing.assert_array_equal(pred2.metas, pred3.metas) + np.testing.assert_array_equal(pred3.metas, pred4.metas) -class TestEllipticEnvelopeLearner(unittest.TestCase): + def test_transform(self): + detector = OneClassSVMLearner(nu=0.1) + detect = detector(self.iris) + pred = detect(self.iris) + self.assert_table_appended_outlier(self.iris, pred) + pred2 = self.iris.transform(pred.domain) + self.assert_table_equal(pred, pred2) + + +class TestEllipticEnvelopeLearner(_TestDetector): @classmethod def setUpClass(cls): + super().setUpClass() np.random.seed(42) domain = Domain((ContinuousVariable("c1"), ContinuousVariable("c2"))) cls.n_true_in, cls.n_true_out = 80, 20 @@ -25,11 +114,10 @@ def setUpClass(cls): def test_EllipticEnvelope(self): y_pred = self.model(self.X_all) - n_pred_out_all = np.sum(y_pred == -1) - n_pred_in_true_in = np.sum(y_pred[:self.n_true_in] == 1) - n_pred_out_true_o = np.sum(y_pred[- self.n_true_out:] == -1) + n_pred_out_all = np.sum(y_pred.metas == 0) + n_pred_in_true_in = np.sum(y_pred.metas[:self.n_true_in] == 1) + n_pred_out_true_o = np.sum(y_pred.metas[- self.n_true_out:] == 0) - self.assertTrue(all(np.absolute(y_pred) == 1)) self.assertGreaterEqual(len(self.X_all) * self.cont, n_pred_out_all) self.assertGreater(1, np.absolute(n_pred_out_all - self.n_true_out)) self.assertGreater(2, np.absolute(n_pred_in_true_in - self.n_true_in)) @@ -37,10 +125,11 @@ def test_EllipticEnvelope(self): def test_mahalanobis(self): n = len(self.X_all) - y_pred = self.model(self.X_all) - y_mahal = self.model.mahalanobis(self.X_all) + pred = self.model(self.X_all) + y_pred = pred[:, self.model.outlier_var].metas + y_mahal = pred[:, self.model.mahal_var].metas y_mahal, y_pred = zip(*sorted(zip(y_mahal, y_pred), reverse=True)) - self.assertTrue(all(i == -1 for i in y_pred[:int(self.cont * n)])) + self.assertTrue(all(i == 0 for i in y_pred[:int(self.cont * n)])) self.assertTrue(all(i == 1 for i in y_pred[int(self.cont * n):])) def test_EllipticEnvelope_ignores_y(self): @@ -58,27 +147,93 @@ def test_EllipticEnvelope_ignores_y(self): pred3 = model(classless_table) pred4 = model(table) - np.testing.assert_array_equal(pred1, pred2) - np.testing.assert_array_equal(pred2, pred3) - np.testing.assert_array_equal(pred3, pred4) + np.testing.assert_array_equal(pred1.metas, pred2.metas) + np.testing.assert_array_equal(pred2.metas, pred3.metas) + np.testing.assert_array_equal(pred3.metas, pred4.metas) + def test_transform(self): + detector = EllipticEnvelopeLearner() + detect = detector(self.iris) + pred = detect(self.iris) + self.assert_table_appended_outlier(self.iris, pred, offset=2) + self.assertEqual(pred.domain.metas[-1].name, "Mahalanobis") + self.assertIsNotNone(pred.domain.metas[-1].compute_value) + pred2 = self.iris.transform(pred.domain) + self.assert_table_equal(pred, pred2) -class TestOutlierDetection(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.iris = Table("iris") - def test_LocalOutlierFactorDetector(self): +class TestLocalOutlierFactorLearner(_TestDetector): + def test_LocalOutlierFactor(self): + detector = LocalOutlierFactorLearner(contamination=0.1) + detect = detector(self.iris) + pred = detect(self.iris) + self.assertEqual(len(np.where(pred.metas == 0)[0]), 14) + + def test_transform(self): detector = LocalOutlierFactorLearner(contamination=0.1) detect = detector(self.iris) - is_inlier = detect(self.iris) - self.assertEqual(len(np.where(is_inlier == -1)[0]), 14) + pred = detect(self.iris) + self.assert_table_appended_outlier(self.iris, pred) + pred2 = self.iris.transform(pred.domain) + self.assert_table_equal(pred, pred2) - def test_IsolationForestDetector(self): + +class TestIsolationForestLearner(_TestDetector): + def test_IsolationForest(self): + detector = IsolationForestLearner(contamination=0.1) + detect = detector(self.iris) + pred = detect(self.iris) + self.assertEqual(len(np.where(pred.metas == 0)[0]), 15) + + def test_transform(self): detector = IsolationForestLearner(contamination=0.1) detect = detector(self.iris) - is_inlier = detect(self.iris) - self.assertEqual(len(np.where(is_inlier == -1)[0]), 15) + pred = detect(self.iris) + self.assert_table_appended_outlier(self.iris, pred) + pred2 = self.iris.transform(pred.domain) + self.assert_table_equal(pred, pred2) + + +class TestOutlierModel(_TestDetector): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.detector = LocalOutlierFactorLearner(contamination=0.1) + + def test_unique_name(self): + domain = Domain((ContinuousVariable("Outlier"),)) + table = Table(domain, np.random.random((40, 1))) + detect = self.detector(table) + pred = detect(table) + self.assertEqual(pred.domain.metas[0].name, "Outlier (1)") + + def test_transform(self): + detect = self.detector(self.iris) + pred = detect(self.iris) + self.assert_table_appended_outlier(self.iris, pred) + pred2 = self.iris.transform(pred.domain) + self.assert_table_equal(pred, pred2) + + def test_transformer(self): + detect = self.detector(self.iris) + pred = detect(self.iris) + var = pred.domain.metas[0] + self.assertIs(var, var.compute_value.variable) + np.testing.assert_array_equal(pred[:, "Outlier"].metas.ravel(), + var.compute_value(self.iris)) + + def test_pickle_model(self): + detect = self.detector(self.iris) + f = tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) + pickle.dump(detect, f) + f.close() + + def test_pickle_prediction(self): + detect = self.detector(self.iris) + pred = detect(self.iris) + f = tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) + pickle.dump(pred, f) + f.close() if __name__ == "__main__": diff --git a/Orange/tests/test_svm.py b/Orange/tests/test_svm.py index 43d34757b90..63f459880f8 100644 --- a/Orange/tests/test_svm.py +++ b/Orange/tests/test_svm.py @@ -7,11 +7,10 @@ import numpy as np from sklearn.exceptions import ConvergenceWarning -from Orange.classification import (SVMLearner, LinearSVMLearner, - NuSVMLearner, OneClassSVMLearner) -from Orange.regression import (SVRLearner, NuSVRLearner) -from Orange.data import Table, Domain, ContinuousVariable +from Orange.classification import SVMLearner, LinearSVMLearner, NuSVMLearner +from Orange.data import Table from Orange.evaluation import CrossValidation, CA, RMSE +from Orange.regression import SVRLearner, NuSVRLearner from Orange.tests import test_filename @@ -72,44 +71,6 @@ def test_NuSVR(self): res = cv(data, [learn]) self.assertLess(RMSE(res)[0], 0.1) - def test_OneClassSVM(self): - np.random.seed(42) - domain = Domain((ContinuousVariable("c1"), ContinuousVariable("c2"))) - X_in = 0.3 * np.random.randn(40, 2) - X_out = np.random.uniform(low=-4, high=4, size=(20, 2)) - X_all = Table(domain, np.r_[X_in + 2, X_in - 2, X_out]) - n_true_in = len(X_in) * 2 - n_true_out = len(X_out) - nu = 0.2 - learner = OneClassSVMLearner(nu=nu) - cls = learner(X_all) - y_pred = cls(X_all) - n_pred_out_all = np.sum(y_pred == -1) - n_pred_in_true_in = np.sum(y_pred[:n_true_in] == 1) - n_pred_out_true_out = np.sum(y_pred[- n_true_out:] == -1) - - self.assertEqual(np.absolute(y_pred).all(), 1) - self.assertLessEqual(n_pred_out_all, len(X_all) * nu) - self.assertLess(np.absolute(n_pred_out_all - n_true_out), 2) - self.assertLess(np.absolute(n_pred_in_true_in - n_true_in), 4) - self.assertLess(np.absolute(n_pred_out_true_out - n_true_out), 3) - - def test_OneClassSVM_ignores_y(self): - domain = Domain((ContinuousVariable("x1"), ContinuousVariable("x2")), - class_vars=(ContinuousVariable("y1"), ContinuousVariable("y2"))) - X = np.random.random((40, 2)) - Y = np.random.random((40, 2)) - table = Table(domain, X, Y) - classless_table = table.transform(Domain(table.domain.attributes)) - learner = OneClassSVMLearner() - classless_model = learner(classless_table) - model = learner(table) - pred1 = classless_model(classless_table) - pred2 = classless_model(table) - pred3 = model(classless_table) - pred4 = model(table) - - np.testing.assert_array_equal(pred1, pred2) - np.testing.assert_array_equal(pred2, pred3) - np.testing.assert_array_equal(pred3, pred4) +if __name__ == "__main__": + unittest.main() diff --git a/Orange/widgets/data/owoutliers.py b/Orange/widgets/data/owoutliers.py index 99bc219aaae..6815d4c6bc9 100644 --- a/Orange/widgets/data/owoutliers.py +++ b/Orange/widgets/data/owoutliers.py @@ -7,11 +7,9 @@ from orangewidget.settings import SettingProvider -from Orange.base import Model from Orange.classification import OneClassSVMLearner, EllipticEnvelopeLearner,\ LocalOutlierFactorLearner, IsolationForestLearner -from Orange.data import Table, Domain, ContinuousVariable, DiscreteVariable -from Orange.data.util import get_unique_names +from Orange.data import Table from Orange.widgets import gui from Orange.widgets.settings import Setting from Orange.widgets.utils.sql import check_sql_input @@ -240,11 +238,26 @@ def enable_controls(self): self.method_combo.model().item(self.Covariance).setEnabled(False) self.Warning.disabled_cov() - def _get_outliers(self) -> Tuple[Table, Table, Table]: + def commit(self): + inliers, outliers, data = self.detect_outliers() + summary = len(inliers) if inliers else self.info.NoOutput + self.info.set_output_summary(summary) + self.Outputs.inliers.send(inliers) + self.Outputs.outliers.send(outliers) + self.Outputs.data.send(data) + + def detect_outliers(self) -> Tuple[Table, Table, Table]: + self.n_inliers = self.n_outliers = None self.Error.singular_cov.clear() self.Error.memory_error.clear() + if not self.data: + return None, None, None try: - y_pred, amended_data = self.detect_outliers() + learner_class = self.METHODS[self.outlier_method] + kwargs = self.current_editor.get_parameters() + learner = learner_class(**kwargs) + model = learner(self.data) + pred = model(self.data) except ValueError: self.Error.singular_cov() return None, None, None @@ -252,63 +265,12 @@ def _get_outliers(self) -> Tuple[Table, Table, Table]: self.Error.memory_error() return None, None, None else: - inliers_ind = np.where(y_pred == 1)[0] - outliers_ind = np.where(y_pred == -1)[0] - inliers = amended_data[inliers_ind] - outliers = amended_data[outliers_ind] - self.n_inliers = len(inliers) - self.n_outliers = len(outliers) - return inliers, outliers, self.annotated_data(amended_data, y_pred) - - def commit(self): - inliers = outliers = data = None - self.n_inliers = self.n_outliers = None - if self.data: - inliers, outliers, data = self._get_outliers() - - summary = len(inliers) if inliers else self.info.NoOutput - self.info.set_output_summary(summary) - self.Outputs.inliers.send(inliers) - self.Outputs.outliers.send(outliers) - self.Outputs.data.send(data) - - def detect_outliers(self) -> Tuple[np.ndarray, Table]: - learner_class = self.METHODS[self.outlier_method] - kwargs = self.current_editor.get_parameters() - learner = learner_class(**kwargs) - model = learner(self.data) - y_pred = model(self.data) - amended_data = self.amended_data(model) - return np.array(y_pred), amended_data - - def amended_data(self, model: Model) -> Table: - if self.outlier_method != self.Covariance: - return self.data - mahal = model.mahalanobis(self.data.X) - mahal = mahal.reshape(len(self.data), 1) - attrs = self.data.domain.attributes - classes = self.data.domain.class_vars - new_metas = list(self.data.domain.metas) + \ - [ContinuousVariable(name="Mahalanobis")] - new_domain = Domain(attrs, classes, new_metas) - amended_data = self.data.transform(new_domain) - amended_data.metas = np.hstack((self.data.metas, mahal)) - return amended_data - - @staticmethod - def annotated_data(data: Table, labels: np.ndarray) -> Table: - domain = data.domain - names = [v.name for v in domain.variables + domain.metas] - name = get_unique_names(names, "Outlier") - - outlier_var = DiscreteVariable(name, values=["Yes", "No"]) - metas = domain.metas + (outlier_var,) - domain = Domain(domain.attributes, domain.class_vars, metas) - data = data.transform(domain) - - labels[labels == -1] = 0 - data.metas[:, -1] = labels - return data + col = pred[:, model.outlier_var].metas + inliers_ind = np.where(col == 1)[0] + outliers_ind = np.where(col == 0)[0] + self.n_inliers = len(inliers_ind) + self.n_outliers = len(outliers_ind) + return self.data[inliers_ind], self.data[outliers_ind], pred def send_report(self): if self.n_outliers is None or self.n_inliers is None: diff --git a/Orange/widgets/data/tests/test_owoutliers.py b/Orange/widgets/data/tests/test_owoutliers.py index fa06dd6a976..2b351c2e06f 100644 --- a/Orange/widgets/data/tests/test_owoutliers.py +++ b/Orange/widgets/data/tests/test_owoutliers.py @@ -4,8 +4,6 @@ import unittest from unittest.mock import patch, Mock -import numpy as np - from Orange.data import Table from Orange.widgets.data.owoutliers import OWOutliers from Orange.widgets.tests.base import WidgetTest, simulate @@ -15,20 +13,54 @@ class TestOWOutliers(WidgetTest): def setUp(self): self.widget = self.create_widget(OWOutliers) self.iris = Table("iris") + self.heart_disease = Table("heart_disease") - def test_data(self): + def test_outputs(self): """Check widget's data and the output with data on the input""" self.send_signal(self.widget.Inputs.data, self.iris) - self.assertEqual(self.widget.data, self.iris) - self.assertEqual(len(self.get_output(self.widget.Outputs.inliers)), 135) - self.assertEqual(len(self.get_output(self.widget.Outputs.outliers)), 15) - self.assertEqual(len(self.get_output(self.widget.Outputs.data)), 150) + inliers = self.get_output(self.widget.Outputs.inliers) + outliers = self.get_output(self.widget.Outputs.outliers) + data = self.get_output(self.widget.Outputs.data) + self.assertEqual(len(inliers), 135) + self.assertEqual(len(outliers), 15) + self.assertEqual(len(data), 150) + self.assertEqual(len(inliers.domain.attributes), 4) + self.assertEqual(len(outliers.domain.attributes), 4) + self.assertEqual(len(data.domain.attributes), 4) + self.assertEqual(len(inliers.domain.class_vars), 1) + self.assertEqual(len(outliers.domain.class_vars), 1) + self.assertEqual(len(data.domain.class_vars), 1) + self.assertEqual(len(inliers.domain.metas), 0) + self.assertEqual(len(outliers.domain.metas), 0) + self.assertEqual(len(data.domain.metas), 1) + self.send_signal(self.widget.Inputs.data, None) - self.assertEqual(self.widget.data, None) self.assertIsNone(self.get_output(self.widget.Outputs.inliers)) self.assertIsNone(self.get_output(self.widget.Outputs.outliers)) self.assertIsNone(self.get_output(self.widget.Outputs.data)) + def test_output_empirical_covariance(self): + simulate.combobox_activate_index(self.widget.method_combo, + self.widget.Covariance) + self.send_signal(self.widget.Inputs.data, self.iris) + inliers = self.get_output(self.widget.Outputs.inliers) + outliers = self.get_output(self.widget.Outputs.outliers) + data = self.get_output(self.widget.Outputs.data) + self.assertEqual(len(inliers), 135) + self.assertEqual(len(outliers), 15) + self.assertEqual(len(data), 150) + self.assertEqual(len(inliers.domain.attributes), 4) + self.assertEqual(len(outliers.domain.attributes), 4) + self.assertEqual(len(data.domain.attributes), 4) + self.assertEqual(len(inliers.domain.class_vars), 1) + self.assertEqual(len(outliers.domain.class_vars), 1) + self.assertEqual(len(data.domain.class_vars), 1) + self.assertEqual(len(inliers.domain.metas), 0) + self.assertEqual(len(outliers.domain.metas), 0) + self.assertEqual(len(data.domain.metas), 2) + self.assertEqual([m.name for m in data.domain.metas], + ["Outlier", "Mahalanobis"]) + def test_methods(self): def callback(): self.widget.send_report() @@ -36,30 +68,37 @@ def callback(): self.assertIsNotNone(self.get_output(self.widget.Outputs.outliers)) self.assertIsNotNone(self.get_output(self.widget.Outputs.data)) - self.send_signal(self.widget.Inputs.data, self.iris) + self.widget.send_report() + self.send_signal(self.widget.Inputs.data, self.heart_disease) simulate.combobox_run_through_all(self.widget.method_combo, callback=callback) - def test_memory_error(self): + @patch("Orange.classification.outlier_detection._OutlierModel.predict") + def test_memory_error(self, mocked_predict: Mock): """ Handling memory error. GH-2374 """ - data = Table("iris")[::3] self.assertFalse(self.widget.Error.memory_error.is_shown()) - with unittest.mock.patch( - "Orange.widgets.data.owoutliers.OWOutliers.detect_outliers", - side_effect=MemoryError): - self.send_signal("Data", data) - self.assertTrue(self.widget.Error.memory_error.is_shown()) + mocked_predict.side_effect = MemoryError + self.send_signal(self.widget.Inputs.data, self.iris) + self.assertTrue(self.widget.Error.memory_error.is_shown()) + + @patch("Orange.classification.outlier_detection._OutlierModel.predict") + def test_singular_cov_error(self, mocked_predict: Mock): + self.assertFalse(self.widget.Error.singular_cov.is_shown()) + mocked_predict.side_effect = ValueError + self.send_signal(self.widget.Inputs.data, self.iris) + self.assertTrue(self.widget.Error.singular_cov.is_shown()) def test_nans(self): """Widget does not crash with nans""" - a = np.arange(20, dtype=float).reshape(4, 5) - a[0, 0] = np.nan - data = Table.from_numpy(None, a) - self.send_signal(self.widget.Inputs.data, data) - self.assertIsNot(self.get_output(self.widget.Outputs.inliers), None) + self.send_signal(self.widget.Inputs.data, self.heart_disease) + self.assertIsNotNone(self.get_output(self.widget.Outputs.inliers)) + simulate.combobox_activate_index(self.widget.method_combo, + self.widget.Covariance) + self.assertIsNotNone(self.get_output(self.widget.Outputs.inliers)) + self.assertFalse(self.widget.Error.singular_cov.is_shown()) def test_in_out_summary(self): info = self.widget.info