[ENH] Outliers: Save model into compute_value #4372

Merged
merged 3 commits on Jan 31, 2020
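A minimal usage sketch of the behaviour this change introduces, assuming this branch of Orange3 is installed; the iris dataset, the choice of EllipticEnvelopeLearner, and the printed values are illustrative, not taken from the PR itself:

```python
import numpy as np

from Orange.classification.outlier_detection import EllipticEnvelopeLearner
from Orange.data import Domain, Table

data = Table("iris")
model = EllipticEnvelopeLearner()(data)

# Calling the model appends an "Outlier" meta column (plus "Mahalanobis"
# for the elliptic envelope) to the returned table.
annotated = model(data)
print([m.name for m in annotated.domain.metas])

# Because the meta variables carry a compute_value that holds the fitted
# model, transforming any compatible table into a domain containing them
# recomputes the predictions on the fly.
out_domain = Domain(data.domain.attributes, metas=[model.outlier_var])
flags = data.transform(out_domain).metas[:, 0]
print(np.sum(flags == 0))  # rows flagged as outliers (value index 0 is "Yes")
```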
153 changes: 132 additions & 21 deletions Orange/classification/outlier_detection.py
@@ -1,21 +1,111 @@
# pylint: disable=unused-argument
import numpy as np

from Orange.data.table import DomainTransformationError
from Orange.data.util import get_unique_names
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

from Orange.base import SklLearner, SklModel
-from Orange.data import Table, Domain
+from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable, \
+    Variable
from Orange.preprocess import AdaptiveNormalize
from Orange.statistics.util import all_nan

__all__ = ["LocalOutlierFactorLearner", "IsolationForestLearner",
"EllipticEnvelopeLearner"]


class _OutlierDetector(SklLearner):
def __call__(self, data: Table):
data = data.transform(Domain(data.domain.attributes))
return super().__call__(data)
"EllipticEnvelopeLearner", "OneClassSVMLearner"]


class _OutlierModel(SklModel):
    def __init__(self, skl_model):
        super().__init__(skl_model)
        self._cached_data = None
        self.outlier_var = None

    def predict(self, X: np.ndarray) -> np.ndarray:
        pred = self.skl_model.predict(X)
        pred[pred == -1] = 0
        return pred[:, None]

    def __call__(self, data: Table) -> Table:
        assert isinstance(data, Table)
        assert self.outlier_var is not None

        domain = Domain(data.domain.attributes, data.domain.class_vars,
                        data.domain.metas + (self.outlier_var,))
        self._cached_data = self.data_to_model_domain(data)
        metas = np.hstack((data.metas, self.predict(self._cached_data.X)))
        return Table.from_numpy(domain, data.X, data.Y, metas)

    def data_to_model_domain(self, data: Table) -> Table:
        if data.domain == self.domain:
            return data

        if self.original_domain.attributes != data.domain.attributes \
                and data.X.size \
                and not all_nan(data.X):
            new_data = data.transform(self.original_domain)
            if all_nan(new_data.X):
                raise DomainTransformationError(
                    "domain transformation produced no defined values")
            return new_data.transform(self.domain)
        return data.transform(self.domain)


class _OutlierLearner(SklLearner):
    __returns__ = _OutlierModel
    supports_multiclass = True

    def _fit_model(self, data: Table) -> _OutlierModel:
        domain = data.domain
        model = super()._fit_model(data.transform(Domain(domain.attributes)))

        transformer = _Transformer(model)
        names = [v.name for v in domain.variables + domain.metas]
        variable = DiscreteVariable(
            get_unique_names(names, "Outlier"),
            values=["Yes", "No"],
            compute_value=transformer
        )

        transformer.variable = variable
        model.outlier_var = variable
        return model


class _Transformer:
    def __init__(self, model: _OutlierModel):
        self._model = model
        self._variable = None

    @property
    def variable(self) -> Variable:
        return self._variable

    @variable.setter
    def variable(self, var: Variable):
        self._variable = var

    def __call__(self, data: Table) -> np.ndarray:
        assert isinstance(self._variable, Variable)
        return self._model(data).get_column_view(self._variable)[0]


class OneClassSVMLearner(_OutlierLearner):
    name = "One class SVM"
    __wraps__ = OneClassSVM
    preprocessors = SklLearner.preprocessors + [AdaptiveNormalize()]

    def __init__(self, kernel='rbf', degree=3, gamma="auto", coef0=0.0,
                 tol=0.001, nu=0.5, shrinking=True, cache_size=200,
                 max_iter=-1, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.params = vars()


-class LocalOutlierFactorLearner(_OutlierDetector):
+class LocalOutlierFactorLearner(_OutlierLearner):
    __wraps__ = LocalOutlierFactor
    name = "Local Outlier Factor"

@@ -27,7 +117,7 @@ def __init__(self, n_neighbors=20, algorithm="auto", leaf_size=30,
        self.params = vars()


-class IsolationForestLearner(_OutlierDetector):
+class IsolationForestLearner(_OutlierLearner):
    __wraps__ = IsolationForest
    name = "Isolation Forest"

@@ -39,25 +129,34 @@ def __init__(self, n_estimators=100, max_samples='auto',
        self.params = vars()


-class EllipticEnvelopeClassifier(SklModel):
-    def mahalanobis(self, observations):
+class EllipticEnvelopeClassifier(_OutlierModel):
+    def __init__(self, skl_model):
+        super().__init__(skl_model)
+        self.mahal_var = None
+
+    def mahalanobis(self, observations: np.ndarray) -> np.ndarray:
        """Computes squared Mahalanobis distances of given observations.

        Parameters
        ----------
-        observations : ndarray (n_samples, n_features) or Orange Table
+        observations : ndarray (n_samples, n_features)

        Returns
        -------
-        distances : ndarray (n_samples,)
+        distances : ndarray (n_samples, 1)
            Squared Mahalanobis distances given observations.
        """
-        if isinstance(observations, Table):
-            observations = observations.X
-        return self.skl_model.mahalanobis(observations)
+        return self.skl_model.mahalanobis(observations)[:, None]

    def __call__(self, data: Table) -> Table:
        pred = super().__call__(data)
        domain = Domain(pred.domain.attributes, pred.domain.class_vars,
                        pred.domain.metas + (self.mahal_var,))
        metas = np.hstack((pred.metas, self.mahalanobis(self._cached_data.X)))
        return Table.from_numpy(domain, pred.X, pred.Y, metas)


-class EllipticEnvelopeLearner(_OutlierDetector):
+class EllipticEnvelopeLearner(_OutlierLearner):
    __wraps__ = EllipticEnvelope
    __returns__ = EllipticEnvelopeClassifier
    name = "Covariance Estimator"
@@ -68,6 +167,18 @@ def __init__(self, store_precision=True, assume_centered=False,
        super().__init__(preprocessors=preprocessors)
        self.params = vars()

-    def __call__(self, data: Table):
-        data = data.transform(Domain(data.domain.attributes))
-        return super().__call__(data)
+    def _fit_model(self, data: Table) -> EllipticEnvelopeClassifier:
+        domain = data.domain
+        model = super()._fit_model(data.transform(Domain(domain.attributes)))
+
+        transformer = _Transformer(model)
+        names = [v.name for v in domain.variables + domain.metas]
+        variable = ContinuousVariable(
+            get_unique_names(names, "Mahalanobis"),
+            compute_value=transformer
+        )
+
+        transformer.variable = variable
+        model.mahal_var = variable
+        return model
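
For context on the mechanism used above: a variable's compute_value is any callable that takes a Table and returns a column, and Orange invokes it whenever data is transformed into a domain containing that variable. A stripped-down toy illustration of the same pattern that _Transformer follows (not part of this PR; MeanDistance and the variable name are invented for the example):

```python
import numpy as np

from Orange.data import ContinuousVariable, Domain, Table


class MeanDistance:
    """Toy compute_value: Euclidean distance of each row from a stored mean."""

    def __init__(self, mean: np.ndarray):
        self._mean = mean

    def __call__(self, data: Table) -> np.ndarray:
        return np.linalg.norm(data.X - self._mean, axis=1)


data = Table("iris")
dist_var = ContinuousVariable(
    "Distance", compute_value=MeanDistance(data.X.mean(axis=0)))

# Transforming into a domain that contains the variable triggers the callable,
# just as a domain containing the "Outlier" meta triggers _Transformer and,
# through it, the stored outlier model.
new_domain = Domain(data.domain.attributes, metas=[dist_var])
print(data.transform(new_domain).metas[:3, 0])
```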

27 changes: 1 addition & 26 deletions Orange/classification/svm.py
@@ -1,12 +1,9 @@
import sklearn.svm as skl_svm

-from Orange.base import SklLearner as SklLearnerBase
from Orange.classification import SklLearner, SklModel
-from Orange.data import Domain
from Orange.preprocess import AdaptiveNormalize

-__all__ = ["SVMLearner", "LinearSVMLearner", "NuSVMLearner",
-           "OneClassSVMLearner"]
+__all__ = ["SVMLearner", "LinearSVMLearner", "NuSVMLearner"]

svm_pps = SklLearner.preprocessors + [AdaptiveNormalize()]

@@ -62,28 +59,6 @@ def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma="auto", coef0=0.0,
        self.params = vars()


-class OneClassSVMLearner(SklLearnerBase):
-    name = "One class SVM"
-    __wraps__ = skl_svm.OneClassSVM
-    preprocessors = svm_pps
-
-    def __init__(self, kernel='rbf', degree=3, gamma="auto", coef0=0.0,
-                 tol=0.001, nu=0.5, shrinking=True, cache_size=200,
-                 max_iter=-1, preprocessors=None):
-        super().__init__(preprocessors=preprocessors)
-        self.params = vars()
-
-    def __call__(self, data):
-        classless_data = data.transform(Domain(data.domain.attributes))
-        return super().__call__(classless_data)
-
-    def fit(self, X, Y=None, W=None):
-        clf = self.__wraps__(**self.params)
-        if W is not None:
-            return self.__returns__(clf.fit(X, W.reshape(-1)))
-        return self.__returns__(clf.fit(X))


if __name__ == '__main__':
    import Orange
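
Since OneClassSVMLearner now lives in outlier_detection, downstream imports would presumably move along these lines (a sketch; whether Orange.classification still re-exports the name at package level is not visible in this diff):

```python
# Pre-PR location (removed above):
# from Orange.classification.svm import OneClassSVMLearner

# Post-PR location:
from Orange.classification.outlier_detection import OneClassSVMLearner

learner = OneClassSVMLearner(nu=0.1)  # constructor parameters are unchanged
```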
