fix cuml

tvdboom · Aug 28, 2023 · 91090b4 · 91090b4
1 parent c17443e
commit 91090b4
Show file tree

Hide file tree

Showing 14 changed files with 163 additions and 161 deletions.
diff --git a/atom/api.py b/atom/api.py
@@ -229,18 +229,18 @@ class ATOMClassifier(BaseTransformer, ATOM):
 
     engine: dict or None, default=None
         Execution engine to use for [data][data-acceleration] and
-        [models][model-acceleration]. The value should be a dictionary
-        with keys `data` and/or `models`, with their corresponding
-        choice as values. If None, the default options are selected.
-        Choose from:
+        [estimators][estimator-acceleration]. The value should be a
+        dictionary with keys `data` and/or `estimator`, with their
+        corresponding choice as values. If None, the default options
+        are selected. Choose from:
 
         - "data":
 
             - "numpy" (default)
             - "pyarrow"
             - "modin"
 
-        - "models":
+        - "estimator":
 
             - "sklearn" (default)
             - "sklearnex"
@@ -455,18 +455,18 @@ class ATOMForecaster(BaseTransformer, ATOM):
 
     engine: dict or None, default=None
         Execution engine to use for [data][data-acceleration] and
-        [models][model-acceleration]. The value should be a dictionary
-        with keys `data` and/or `models`, with their corresponding
-        choice as values. If None, the default options are selected.
-        Choose from:
+        [estimators][estimator-acceleration]. The value should be a
+        dictionary with keys `data` and/or `estimator`, with their
+        corresponding choice as values. If None, the default options
+        are selected. Choose from:
 
         - "data":
 
             - "numpy" (default)
             - "pyarrow"
             - "modin"
 
-        - "models":
+        - "estimator":
 
             - "sklearn" (default)
             - "sklearnex"
@@ -684,18 +684,18 @@ class ATOMRegressor(BaseTransformer, ATOM):
 
     engine: dict or None, default=None
         Execution engine to use for [data][data-acceleration] and
-        [models][model-acceleration]. The value should be a dictionary
-        with keys `data` and/or `models`, with their corresponding
-        choice as values. If None, the default options are selected.
-        Choose from:
+        [estimators][estimator-acceleration]. The value should be a
+        dictionary with keys `data` and/or `estimator`, with their
+        corresponding choice as values. If None, the default options
+        are selected. Choose from:
 
         - "data":
 
             - "numpy" (default)
             - "pyarrow"
             - "modin"
 
-        - "models":
+        - "estimator":
 
             - "sklearn" (default)
             - "sklearnex"

diff --git a/atom/atom.py b/atom/atom.py
@@ -120,8 +120,8 @@ def __init__(
             self.log("GPU training enabled.", 1)
         if (data := self.engine.get("data")) != "numpy":
             self.log(f"Data engine: {data}.", 1)
-        if (models := self.engine.get("models")) != "sklearn":
-            self.log(f"Models engine: {models}.", 1)
+        if (models := self.engine.get("estimator")) != "sklearn":
+            self.log(f"Estimator engine: {models}.", 1)
         if self.backend == "ray" or self.n_jobs > 1:
             self.log(f"Parallelization backend: {self.backend}", 1)
         if self.experiment:

diff --git a/atom/basemodel.py b/atom/basemodel.py
@@ -109,18 +109,18 @@ class BaseModel(BaseTransformer, BaseTracker, HTPlot, PredictionPlot, ShapPlot):
 
     engine: dict or None, default=None
         Execution engine to use for [data][data-acceleration] and
-        [models][model-acceleration]. The value should be a dictionary
-        with keys `data` and/or `models`, with their corresponding
-        choice as values. If None, the default options are selected.
-        Choose from:
+        [estimators][estimator-acceleration]. The value should be a
+        dictionary with keys `data` and/or `estimator`, with their
+        corresponding choice as values. If None, the default options
+        are selected. Choose from:
 
         - "data":
 
             - "numpy" (default)
             - "pyarrow"
             - "modin"
 
-        - "models":
+        - "estimator":
 
             - "sklearn" (default)
             - "sklearnex"

diff --git a/atom/basetransformer.py b/atom/basetransformer.py
@@ -55,7 +55,7 @@ class BaseTransformer:
         - backend: Parallelization backend.
         - verbose: Verbosity level of the output.
         - warnings: Whether to show or suppress encountered warnings.
-        - logger: Name of the log file or Logger object.
+        - logger: Name of the log file, Logger object or None.
         - experiment: Name of the mlflow experiment used for tracking.
         - random_state: Seed used by the random number generator.
 
@@ -120,8 +120,8 @@ def engine(self) -> dict:
     @engine.setter
     def engine(self, value: dict | None):
         if not value:
-            value = {"data": "numpy", "models": "sklearn"}
-        elif "data" not in value and "models" not in value:
+            value = {"data": "numpy", "estimator": "sklearn"}
+        elif "data" not in value and "estimator" not in value:
             raise ValueError(
                 f"Invalid value for the engine parameter, got {value}. "
-                "The value should be a dict with keys 'data' and/or 'models'."
+                "The value should be a dict with keys 'data' and/or 'estimator'."
@@ -145,7 +145,7 @@ def engine(self, value: dict | None):
         # Update env variable to use for PandasModin in utils.py
         os.environ["ATOM_DATA_ENGINE"] = value["data"].lower()
 
-        if models := value.get("models"):
+        if models := value.get("estimator"):
             if models.lower() == "sklearnex":
                 if not find_spec("sklearnex"):
                     raise ModuleNotFoundError(
@@ -171,7 +171,7 @@ def engine(self, value: dict | None):
                     f"got {models}. Choose from: sklearn, sklearnex, cuml."
                 )
         else:
-            value["models"] = "sklearn"
+            value["estimator"] = "sklearn"
 
         self._engine = value
 

diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py
@@ -546,18 +546,18 @@ class Cleaner(BaseEstimator, TransformerMixin, BaseTransformer):
 
     engine: dict or None, default=None
         Execution engine to use for [data][data-acceleration] and
-        [models][model-acceleration]. The value should be a dictionary
-        with keys `data` and/or `models`, with their corresponding
-        choice as values. If None, the default options are selected.
-        Choose from:
+        [estimators][estimator-acceleration]. The value should be a
+        dictionary with keys `data` and/or `estimator`, with their
+        corresponding choice as values. If None, the default options
+        are selected. Choose from:
 
         - "data":
 
             - "numpy" (default)
             - "pyarrow"
             - "modin"
 
-        - "models":
+        - "estimator":
 
             - "sklearn" (default)
             - "cuml"
@@ -979,18 +979,18 @@ class Discretizer(BaseEstimator, TransformerMixin, BaseTransformer):
 
     engine: dict or None, default=None
         Execution engine to use for [data][data-acceleration] and
-        [models][model-acceleration]. The value should be a dictionary
-        with keys `data` and/or `models`, with their corresponding
-        choice as values. If None, the default options are selected.
-        Choose from:
+        [estimators][estimator-acceleration]. The value should be a
+        dictionary with keys `data` and/or `estimator`, with their
+        corresponding choice as values. If None, the default options
+        are selected. Choose from:
 
         - "data":
 
             - "numpy" (default)
             - "pyarrow"
             - "modin"
 
-        - "models":
+        - "estimator":
 
             - "sklearn" (default)
             - "cuml"
@@ -1701,18 +1701,18 @@ class Imputer(BaseEstimator, TransformerMixin, BaseTransformer):
 
     engine: dict or None, default=None
         Execution engine to use for [data][data-acceleration] and
-        [models][model-acceleration]. The value should be a dictionary
-        with keys `data` and/or `models`, with their corresponding
-        choice as values. If None, the default options are selected.
-        Choose from:
+        [estimators][estimator-acceleration]. The value should be a
+        dictionary with keys `data` and/or `estimator`, with their
+        corresponding choice as values. If None, the default options
+        are selected. Choose from:
 
         - "data":
 
             - "numpy" (default)
             - "pyarrow"
             - "modin"
 
-        - "models":
+        - "estimator":
 
             - "sklearn" (default)
             - "cuml"
@@ -1896,7 +1896,7 @@ def fit(self, X: FEATURES, y: TARGET | None = None) -> Imputer:
         # Load the imputer class from sklearn or cuml (different modules)
         estimator = self._get_est_class(
             name="SimpleImputer",
-            module="preprocessing" if self.engine["models"] == "cuml" else "impute",
+            module="preprocessing" if self.engine["estimator"] == "cuml" else "impute",
         )
 
         # Assign an imputer to each column
@@ -2102,18 +2102,18 @@ class Normalizer(BaseEstimator, TransformerMixin, BaseTransformer):
 
     engine: dict or None, default=None
         Execution engine to use for [data][data-acceleration] and
-        [models][model-acceleration]. The value should be a dictionary
-        with keys `data` and/or `models`, with their corresponding
-        choice as values. If None, the default options are selected.
-        Choose from:
+        [estimators][estimator-acceleration]. The value should be a
+        dictionary with keys `data` and/or `estimator`, with their
+        corresponding choice as values. If None, the default options
+        are selected. Choose from:
 
         - "data":
 
             - "numpy" (default)
             - "pyarrow"
             - "modin"
 
-        - "models":
+        - "estimator":
 
             - "sklearn" (default)
             - "cuml"
@@ -2401,18 +2401,18 @@ class Pruner(BaseEstimator, TransformerMixin, BaseTransformer):
 
     engine: dict or None, default=None
         Execution engine to use for [data][data-acceleration] and
-        [models][model-acceleration]. The value should be a dictionary
-        with keys `data` and/or `models`, with their corresponding
-        choice as values. If None, the default options are selected.
-        Choose from:
+        [estimators][estimator-acceleration]. The value should be a
+        dictionary with keys `data` and/or `estimator`, with their
+        corresponding choice as values. If None, the default options
+        are selected. Choose from:
 
         - "data":
 
             - "numpy" (default)
             - "pyarrow"
             - "modin"
 
-        - "models":
+        - "estimator":
 
             - "sklearn" (default)
             - "sklearnex"
@@ -2706,18 +2706,18 @@ class Scaler(BaseEstimator, TransformerMixin, BaseTransformer):
 
     engine: dict or None, default=None
         Execution engine to use for [data][data-acceleration] and
-        [models][model-acceleration]. The value should be a dictionary
-        with keys `data` and/or `models`, with their corresponding
-        choice as values. If None, the default options are selected.
-        Choose from:
+        [estimators][estimator-acceleration]. The value should be a
+        dictionary with keys `data` and/or `estimator`, with their
+        corresponding choice as values. If None, the default options
+        are selected. Choose from:
 
         - "data":
 
             - "numpy" (default)
             - "pyarrow"
             - "modin"
 
-        - "models":
+        - "estimator":
 
             - "sklearn" (default)
             - "cuml"

diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py
@@ -998,18 +998,18 @@ class FeatureSelector(
 
     engine: dict or None, default=None
         Execution engine to use for [data][data-acceleration] and
-        [models][model-acceleration]. The value should be a dictionary
-        with keys `data` and/or `models`, with their corresponding
-        choice as values. If None, the default options are selected.
-        Choose from:
+        [estimators][estimator-acceleration]. The value should be a
+        dictionary with keys `data` and/or `estimator`, with their
+        corresponding choice as values. If None, the default options
+        are selected. Choose from:
 
         - "data":
 
             - "numpy" (default)
             - "pyarrow"
             - "modin"
 
-        - "models":
+        - "estimator":
 
             - "sklearn" (default)
             - "sklearnex"
@@ -1411,35 +1411,31 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
             self._estimator = SelectKBest(solver, k=self._n_features).fit(X, y)
 
         elif self.strategy.lower() == "pca":
-            # The PCA and TruncatedSVD both get all possible components to use
-            # for the plots (n_components must be < n_features and <= n_rows)
-            if is_sparse(X):
-                estimator = self._get_est_class("TruncatedSVD", "decomposition")
-
-                self._estimator = estimator(
-                    n_components=min(len(X), X.shape[1] - 1),
-                    algorithm="randomized" if self.solver is None else self.solver,
-                    random_state=self.random_state,
-                    **self.kwargs,
-                )
-            else:
+            if not is_sparse(X):
+                # PCA requires the features to be scaled
                 if not check_scaling(X):
                     self.scaler = Scaler()
                     X = self.scaler.fit_transform(X)
 
                 estimator = self._get_est_class("PCA", "decomposition")
+                solver_param = "svd_solver"
+            else:
+                estimator = self._get_est_class("TruncatedSVD", "decomposition")
+                solver_param = "algorithm"
 
-                if self.solver is None:
-                    solver = sign(estimator)["svd_solver"].default
-                else:
-                    solver = self.solver
+            if self.solver is None:
+                solver = sign(estimator)[solver_param].default
+            else:
+                solver = self.solver
 
-                self._estimator = estimator(
-                    n_components=min(len(X), X.shape[1] - 1),
-                    svd_solver=solver,
-                    random_state=self.random_state,
-                    **self.kwargs,
-                )
+            # The PCA and TruncatedSVD both get all possible components to use
+            # for the plots (n_components must be < n_features and <= n_rows)
+            self._estimator = estimator(
+                n_components=min(len(X), X.shape[1] - 1),
+                **{solver_param: solver},
+                random_state=self.random_state,
+                **self.kwargs,
+            )
 
             self._estimator.fit(X)
             self._estimator._comps = min(