diff --git a/pyproject.toml b/pyproject.toml index ce7e06e1..c344c895 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ maintainers = [ ] dependencies = [ - "narwhals>=1.2.0", + "narwhals>=1.5.0", "pandas>=1.1.5", "scikit-learn>=1.0", "importlib-metadata >= 1.0; python_version < '3.8'", diff --git a/sklego/common.py b/sklego/common.py index 548faea2..d3652277 100644 --- a/sklego/common.py +++ b/sklego/common.py @@ -4,11 +4,11 @@ import numpy as np import pandas as pd -from sklearn.base import TransformerMixin +from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_array, check_is_fitted, check_X_y -class TrainOnlyTransformerMixin(TransformerMixin): +class TrainOnlyTransformerMixin(TransformerMixin, BaseEstimator): """Mixin class for transformers that can handle training and test data differently. This mixin allows using a separate function for transforming training and test data. diff --git a/sklego/decomposition/pca_reconstruction.py b/sklego/decomposition/pca_reconstruction.py index 3dcc51aa..cb02ad21 100644 --- a/sklego/decomposition/pca_reconstruction.py +++ b/sklego/decomposition/pca_reconstruction.py @@ -4,7 +4,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted -class PCAOutlierDetection(BaseEstimator, OutlierMixin): +class PCAOutlierDetection(OutlierMixin, BaseEstimator): """`PCAOutlierDetection` is an outlier detector based on the reconstruction error from PCA. 
If the difference between original and reconstructed data is larger than the `threshold`, the point is diff --git a/sklego/decomposition/umap_reconstruction.py b/sklego/decomposition/umap_reconstruction.py index 330fe8f8..3859f490 100644 --- a/sklego/decomposition/umap_reconstruction.py +++ b/sklego/decomposition/umap_reconstruction.py @@ -11,7 +11,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted -class UMAPOutlierDetection(BaseEstimator, OutlierMixin): +class UMAPOutlierDetection(OutlierMixin, BaseEstimator): """`UMAPOutlierDetection` is an outlier detector based on the reconstruction error from UMAP. If the difference between original and reconstructed data is larger than the `threshold`, the point is diff --git a/sklego/dummy.py b/sklego/dummy.py index 35b4d639..03157161 100644 --- a/sklego/dummy.py +++ b/sklego/dummy.py @@ -11,7 +11,7 @@ ) -class RandomRegressor(BaseEstimator, RegressorMixin): +class RandomRegressor(RegressorMixin, BaseEstimator): """A `RandomRegressor` makes random predictions only based on the `y` value that is seen. The goal is that such a regressor can be used for benchmarking. It _should be_ easily beatable. diff --git a/sklego/linear_model.py b/sklego/linear_model.py index ebe9fc43..4673b608 100644 --- a/sklego/linear_model.py +++ b/sklego/linear_model.py @@ -27,7 +27,7 @@ ) -class LowessRegression(BaseEstimator, RegressorMixin): +class LowessRegression(RegressorMixin, BaseEstimator): """`LowessRegression` estimator: LOWESS (Locally Weighted Scatterplot Smoothing) is a type of [local regression](https://en.wikipedia.org/wiki/Local_regression). @@ -155,7 +155,7 @@ def predict(self, X): return results -class ProbWeightRegression(BaseEstimator, RegressorMixin): +class ProbWeightRegression(RegressorMixin, BaseEstimator): """`ProbWeightRegression` assumes that all input signals in `X` need to be reweighted with weights that sum up to one in order to predict `y`. 
@@ -276,7 +276,7 @@ def coefs_(self): return self.coef_ -class DeadZoneRegressor(BaseEstimator, RegressorMixin): +class DeadZoneRegressor(RegressorMixin, BaseEstimator): r"""The `DeadZoneRegressor` estimator implements a regression model that incorporates a _dead zone effect_ for improving the robustness of regression predictions. @@ -480,7 +480,7 @@ def allowed_effects(self): return self._ALLOWED_EFFECTS -class _FairClassifier(BaseEstimator, LinearClassifierMixin): +class _FairClassifier(LinearClassifierMixin, BaseEstimator): """Base class for fair classifiers that address sensitive attribute fairness. This base class provides a foundation for fair classifiers that aim to mitigate bias and discrimination by taking @@ -682,7 +682,7 @@ def _more_tags(self): return {"poor_score": True} -class DemographicParityClassifier(BaseEstimator, LinearClassifierMixin): +class DemographicParityClassifier(LinearClassifierMixin, BaseEstimator): r"""`DemographicParityClassifier` is a logistic regression classifier which can be constrained on demographic parity (p% score). @@ -800,7 +800,7 @@ def constraints(self, y_hat, y_true, sensitive, n_obs): return [] -class EqualOpportunityClassifier(BaseEstimator, LinearClassifierMixin): +class EqualOpportunityClassifier(LinearClassifierMixin, BaseEstimator): r"""`EqualOpportunityClassifier` is a logistic regression classifier which can be constrained on equal opportunity score. @@ -914,7 +914,7 @@ def constraints(self, y_hat, y_true, sensitive, n_obs): return [] -class BaseScipyMinimizeRegressor(BaseEstimator, RegressorMixin, ABC): +class BaseScipyMinimizeRegressor(RegressorMixin, BaseEstimator, ABC): """Abstract base class for regressors relying on Scipy's [minimize method](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html) to minimize a (custom) loss function. 
diff --git a/sklego/meta/_grouped_utils.py b/sklego/meta/_grouped_utils.py index c873e0cb..6d65ad3c 100644 --- a/sklego/meta/_grouped_utils.py +++ b/sklego/meta/_grouped_utils.py @@ -33,9 +33,9 @@ def parse_X_y(X, y, groups, check_X=True, **kwargs) -> nw.DataFrame: # Convert y and assign it to the frame n_samples = X.shape[0] - y_series = nw.from_dict( - data={"tmp": [None] * n_samples if y is None else y}, native_namespace=nw.get_native_namespace(X) - )["tmp"] + y_series = nw.new_series( + name="tmp", values=[None] * n_samples if y is None else y, native_namespace=nw.get_native_namespace(X) + ) return X.with_columns(__sklego_target__=y_series) diff --git a/sklego/meta/confusion_balancer.py b/sklego/meta/confusion_balancer.py index 8821d8b0..26b00fdc 100644 --- a/sklego/meta/confusion_balancer.py +++ b/sklego/meta/confusion_balancer.py @@ -7,7 +7,7 @@ from sklego.base import ProbabilisticClassifier -class ConfusionBalancer(BaseEstimator, MetaEstimatorMixin, ClassifierMixin): +class ConfusionBalancer(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): r"""The `ConfusionBalancer` estimator attempts to give it's child estimator a more balanced output by learning from the confusion matrix during training. diff --git a/sklego/meta/grouped_predictor.py b/sklego/meta/grouped_predictor.py index 40878201..80eb819f 100644 --- a/sklego/meta/grouped_predictor.py +++ b/sklego/meta/grouped_predictor.py @@ -402,7 +402,7 @@ def _more_tags(self): return {"allow_nan": True} -class GroupedRegressor(GroupedPredictor, RegressorMixin): +class GroupedRegressor(RegressorMixin, GroupedPredictor): """`GroupedRegressor` is a meta-estimator that fits a separate regressor for each group in the input data. 
Its spec is the same as [`GroupedPredictor`][sklego.meta.grouped_predictor.GroupedPredictor] but it is available @@ -439,7 +439,7 @@ def fit(self, X, y): return super().fit(X, y) -class GroupedClassifier(GroupedPredictor, ClassifierMixin): +class GroupedClassifier(ClassifierMixin, GroupedPredictor): """`GroupedClassifier` is a meta-estimator that fits a separate classifier for each group in the input data. Its equivalent to [`GroupedPredictor`][sklego.meta.grouped_predictor.GroupedPredictor] with `shrinkage=None` diff --git a/sklego/meta/hierarchical_predictor.py b/sklego/meta/hierarchical_predictor.py index 5d71cc5c..058d0f0b 100644 --- a/sklego/meta/hierarchical_predictor.py +++ b/sklego/meta/hierarchical_predictor.py @@ -282,10 +282,10 @@ def fit(self, X, y=None): raise ValueError(msg) native_namespace = nw.get_native_namespace(X) - target_series = nw.from_dict({self._TARGET_NAME: y}, native_namespace=native_namespace)[self._TARGET_NAME] - global_series = nw.from_dict({self._GLOBAL_NAME: np.ones(n_samples)}, native_namespace=native_namespace)[ - self._GLOBAL_NAME - ] + target_series = nw.new_series(name=self._TARGET_NAME, values=y, native_namespace=native_namespace) + global_series = nw.new_series( + name=self._GLOBAL_NAME, values=np.ones(n_samples), native_namespace=native_namespace + ) frame = X.with_columns( **{ self._TARGET_NAME: target_series, @@ -322,9 +322,9 @@ def _predict_estimators(self, X, method_name): n_samples = X.shape[0] native_namespace = nw.get_native_namespace(X) - global_series = nw.from_dict({self._GLOBAL_NAME: np.ones(n_samples)}, native_namespace=native_namespace)[ - self._GLOBAL_NAME - ] + global_series = nw.new_series( + name=self._GLOBAL_NAME, values=np.ones(n_samples), native_namespace=native_namespace + ) frame = X.with_columns( **{ @@ -424,7 +424,7 @@ def _more_tags(self): return {"allow_nan": True} -class HierarchicalRegressor(HierarchicalPredictor, RegressorMixin): +class HierarchicalRegressor(RegressorMixin, 
HierarchicalPredictor): """A hierarchical regressor that predicts values using hierarchical grouping. This class extends [`HierarchicalPredictor`][sklego.meta.hierarchical_predictor.HierarchicalPredictor] and adds @@ -537,7 +537,7 @@ def predict(self, X): return self._predict_estimators(X, "predict") -class HierarchicalClassifier(HierarchicalPredictor, ClassifierMixin): +class HierarchicalClassifier(ClassifierMixin, HierarchicalPredictor): """A hierarchical classifier that predicts labels using hierarchical grouping. This class extends [`HierarchicalPredictor`][sklego.meta.hierarchical_predictor.HierarchicalPredictor] and adds diff --git a/sklego/meta/outlier_classifier.py b/sklego/meta/outlier_classifier.py index 09f6d50d..d965e443 100644 --- a/sklego/meta/outlier_classifier.py +++ b/sklego/meta/outlier_classifier.py @@ -7,7 +7,7 @@ from sklego.base import OutlierModel -class OutlierClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): +class OutlierClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): """Morphs an outlier detection model into a classifier. When an outlier is detected it will output 1 and 0 otherwise. This way you can use familiar metrics again and this diff --git a/sklego/meta/regression_outlier_detector.py b/sklego/meta/regression_outlier_detector.py index 6ef8a8b2..4c51267a 100644 --- a/sklego/meta/regression_outlier_detector.py +++ b/sklego/meta/regression_outlier_detector.py @@ -5,7 +5,7 @@ from sklearn.utils.validation import check_array, check_is_fitted -class RegressionOutlierDetector(BaseEstimator, OutlierMixin): +class RegressionOutlierDetector(OutlierMixin, BaseEstimator): """Morphs a regression estimator into one that can detect outliers. We will try to predict `column` in X. 
Parameters diff --git a/sklego/meta/subjective_classifier.py b/sklego/meta/subjective_classifier.py index 60e72463..b396bddc 100644 --- a/sklego/meta/subjective_classifier.py +++ b/sklego/meta/subjective_classifier.py @@ -6,7 +6,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted, check_X_y -class SubjectiveClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): +class SubjectiveClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): """Corrects predictions of the inner classifier by taking into account a (subjective) prior distribution of the classes. diff --git a/sklego/meta/thresholder.py b/sklego/meta/thresholder.py index b08e76b8..126071f0 100644 --- a/sklego/meta/thresholder.py +++ b/sklego/meta/thresholder.py @@ -10,7 +10,7 @@ from sklego.base import ProbabilisticClassifier -class Thresholder(BaseEstimator, ClassifierMixin): +class Thresholder(ClassifierMixin, BaseEstimator): """Takes a binary classifier and moves the threshold. This way you might design the algorithm to only accept a certain class if the probability for it is larger than, say, 90% instead of 50%. diff --git a/sklego/meta/zero_inflated_regressor.py b/sklego/meta/zero_inflated_regressor.py index 18d41a14..3b41626b 100644 --- a/sklego/meta/zero_inflated_regressor.py +++ b/sklego/meta/zero_inflated_regressor.py @@ -8,7 +8,7 @@ from sklearn.utils.validation import _check_sample_weight, check_array, check_is_fitted, check_X_y -class ZeroInflatedRegressor(BaseEstimator, RegressorMixin, MetaEstimatorMixin): +class ZeroInflatedRegressor(RegressorMixin, MetaEstimatorMixin, BaseEstimator): """A meta regressor for zero-inflated datasets, i.e. the targets contain a lot of zeroes. `ZeroInflatedRegressor` consists of a classifier and a regressor. 
diff --git a/sklego/mixture/bayesian_gmm_classifier.py b/sklego/mixture/bayesian_gmm_classifier.py index 66b6b5e0..805420df 100644 --- a/sklego/mixture/bayesian_gmm_classifier.py +++ b/sklego/mixture/bayesian_gmm_classifier.py @@ -7,7 +7,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted -class BayesianGMMClassifier(BaseEstimator, ClassifierMixin): +class BayesianGMMClassifier(ClassifierMixin, BaseEstimator): """The `BayesianGMMClassifier` trains a Gaussian Mixture Model for each class in `y` on a dataset `X`. Once a density is trained for each class we can evaluate the likelihood scores to see which class is more likely. diff --git a/sklego/mixture/gmm_classifier.py b/sklego/mixture/gmm_classifier.py index 01044325..9b6705a5 100644 --- a/sklego/mixture/gmm_classifier.py +++ b/sklego/mixture/gmm_classifier.py @@ -7,7 +7,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted -class GMMClassifier(BaseEstimator, ClassifierMixin): +class GMMClassifier(ClassifierMixin, BaseEstimator): """The `GMMClassifier` trains a Gaussian Mixture Model for each class in `y` on a dataset `X`. Once a density is trained for each class we can evaluate the likelihood scores to see which class is more likely. 
diff --git a/sklego/model_selection.py b/sklego/model_selection.py index 9fa26132..08747492 100644 --- a/sklego/model_selection.py +++ b/sklego/model_selection.py @@ -263,8 +263,8 @@ def update_split_info(indices, j, part, summary): j = 0 for i in self.split(nw.to_native(X)): - train_info = nw.to_native(nw.from_dict({"tmp": i[0]}, native_namespace=native_namespace)["tmp"]) - valid_info = nw.to_native(nw.from_dict({"tmp": i[1]}, native_namespace=native_namespace)["tmp"]) + train_info = nw.to_native(nw.new_series(name="tmp", values=i[0], native_namespace=native_namespace)) + valid_info = nw.to_native(nw.new_series(name="tmp", values=i[1], native_namespace=native_namespace)) update_split_info(train_info, j, "train", summary) update_split_info(valid_info, j, "valid", summary) j = j + 1 diff --git a/sklego/naive_bayes.py b/sklego/naive_bayes.py index a3fab146..2ed87aed 100644 --- a/sklego/naive_bayes.py +++ b/sklego/naive_bayes.py @@ -8,7 +8,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted -class GaussianMixtureNB(BaseEstimator, ClassifierMixin): +class GaussianMixtureNB(ClassifierMixin, BaseEstimator): """The `GaussianMixtureNB` estimator is a naive bayes classifier that uses a mixture of gaussians instead of merely a single one. In particular it trains a `GaussianMixture` model for each class in the target and for each feature in the data, on the subset of `X` where `y == class`. @@ -158,7 +158,7 @@ def num_fit_cols_(self): return self.n_features_in_ -class BayesianGaussianMixtureNB(BaseEstimator, ClassifierMixin): +class BayesianGaussianMixtureNB(ClassifierMixin, BaseEstimator): """The `BayesianGaussianMixtureNB` estimator is a naive bayes classifier that uses a bayesian mixture of gaussians instead of merely a single one. In particular it trains a `BayesianGaussianMixture` model for each class in the target and for each feature in the data, on the subset of `X` where `y == class`. 
diff --git a/sklego/neighbors.py b/sklego/neighbors.py index 55cdbe19..9a35ba0c 100644 --- a/sklego/neighbors.py +++ b/sklego/neighbors.py @@ -6,7 +6,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted -class BayesianKernelDensityClassifier(BaseEstimator, ClassifierMixin): +class BayesianKernelDensityClassifier(ClassifierMixin, BaseEstimator): """The `BayesianKernelDensityClassifier` estimator trains using Kernel Density estimations to generate the joint distribution. diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index 2af07cb3..faccfa98 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -60,7 +60,7 @@ def _nw_select_dtypes(include: str | list[str], exclude: str | list[str], schema return feature_names -class ColumnDropper(BaseEstimator, TransformerMixin): +class ColumnDropper(TransformerMixin, BaseEstimator): """The `ColumnDropper` transformer allows dropping specific columns from a DataFrame by name. Can be useful in a sklearn Pipeline. @@ -226,7 +226,7 @@ def _check_column_names(self, X): raise KeyError(f"{list(non_existent_columns)} column(s) not in DataFrame") -class TypeSelector(BaseEstimator, TransformerMixin): +class TypeSelector(TransformerMixin, BaseEstimator): """The `TypeSelector` transformer allows to select columns in a DataFrame based on their type. Can be useful in a sklearn Pipeline. @@ -412,7 +412,7 @@ def __init__(self, include=None, exclude=None): super().__init__(include=include, exclude=exclude) -class ColumnSelector(BaseEstimator, TransformerMixin): +class ColumnSelector(TransformerMixin, BaseEstimator): """The `ColumnSelector` transformer allows selecting specific columns from a DataFrame by name. Can be useful in a sklearn Pipeline. 
diff --git a/tests/test_meta/test_grouped_predictor.py b/tests/test_meta/test_grouped_predictor.py index 93a181f0..cc08a874 100644 --- a/tests/test_meta/test_grouped_predictor.py +++ b/tests/test_meta/test_grouped_predictor.py @@ -32,6 +32,7 @@ def test_sklearn_compatible_estimator(estimator, check): "check_fit2d_predict1d", # custom message "check_estimators_empty_data_messages", # custom message "check_supervised_y_2d", # TODO: Is it possible to support multioutput? + "check_requires_y_none", }: pytest.skip() diff --git a/tests/test_meta/test_hierarchical_predictor.py b/tests/test_meta/test_hierarchical_predictor.py index a6bbc465..02d9d321 100644 --- a/tests/test_meta/test_hierarchical_predictor.py +++ b/tests/test_meta/test_hierarchical_predictor.py @@ -31,6 +31,7 @@ def test_sklearn_compatible_estimator(estimator, check): "check_fit2d_1feature", # custom message "check_supervised_y_2d", # TODO: Is it possible to support multioutput? "check_estimators_empty_data_messages", # custom message + "check_requires_y_none", }: pytest.skip()