Skip to content

Commit

Permalink
Adding sklearn 0.24 support (#1016)
Browse files Browse the repository at this point in the history
* Adding importable helper functions

* Changing import of cat, cont

* Better docstrings

* Adding unit test to check ColumnTransformer

* Refinements from @mfeurer

* Editing example to support both NumPy and Pandas

* Unit test fix to mark for deletion

* Making some unit tests work

* Waiting for dataset to be processed

* Minor test collection fix

* Template to handle missing tasks

* Accounting for more missing tasks:

* Fixing some more unit tests

* Simplifying check_task_existence

* black changes

* Minor formatting

* Handling task exists check

* Testing edited check task func

* Flake fix

* More retries on connection error

* Adding max_retries to config default

* Update database retry unit test

* Print to debug hash exception

* Fixing checksum unit test

* Retry on _download_text_file

* Update datasets_tutorial.py

* Update custom_flow_tutorial.py

* Update test_study_functions.py

* Update test_dataset_functions.py

* more retries, but also more time between retries

* allow for even more retries on get calls

* Catching failed get task

* undo stupid change

* fix one more test

* Refactoring md5 hash check inside _send_request

* Fixing a fairly common unit test fail

* Reverting loose check on unit test

* Updating examples to run on sklearn 0.24

* Spawning tests for sklearn 0.24

* Adding numpy import

* Fixing integer type check to allow np.integer

* Making unit tests run on sklearn 0.24

* black fix

* Trying to loosen check on unit test as fix

* simplify examples

* disable test for old python version

Co-authored-by: Matthias Feurer <feurerm@informatik.uni-freiburg.de>
Co-authored-by: PGijsbers <p.gijsbers@tue.nl>
Co-authored-by: neeratyoy <>
  • Loading branch information
3 people authored Feb 11, 2021
1 parent 80ae046 commit d2945ba
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ubuntu-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
scikit-learn: [0.21.2, 0.22.2, 0.23.1]
scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24]
exclude: # no scikit-learn 0.21.2 release for Python 3.8
- python-version: 3.8
scikit-learn: 0.21.2
Expand Down
48 changes: 20 additions & 28 deletions examples/30_extended/flows_and_runs_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
# License: BSD 3-Clause

import openml
import numpy as np
from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree

############################################################################
Expand Down Expand Up @@ -54,7 +53,7 @@
task = openml.tasks.get_task(403)

# Build any classifier or pipeline
clf = tree.ExtraTreeClassifier()
clf = tree.DecisionTreeClassifier()

# Run the flow
run = openml.runs.run_model_on_task(clf, task)
Expand Down Expand Up @@ -83,7 +82,10 @@
# ############################
#
# When you need to handle 'dirty' data, build pipelines to model them automatically.
task = openml.tasks.get_task(1)
# To demonstrate this, we use the dataset `credit-a <https://test.openml.org/d/16>`_ via
# `task <https://test.openml.org/t/96>`_, as it contains both numerical and categorical
# variables, with missing values in both.
task = openml.tasks.get_task(96)

# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
from openml.extensions.sklearn import cat, cont
Expand All @@ -96,20 +98,14 @@
[
(
"categorical",
pipeline.Pipeline(
[
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
(
"Encoder",
preprocessing.OneHotEncoder(
sparse=False, handle_unknown="ignore"
),
),
]
),
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
cat, # returns the categorical feature indices
),
("continuous", "passthrough", cont), # returns the numeric feature indices
(
"continuous",
impute.SimpleImputer(strategy="median"),
cont,
), # returns the numeric feature indices
]
),
),
Expand Down Expand Up @@ -146,20 +142,14 @@
[
(
"categorical",
pipeline.Pipeline(
[
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
(
"Encoder",
preprocessing.OneHotEncoder(
sparse=False, handle_unknown="ignore"
),
),
]
),
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
categorical_feature_indices,
),
("continuous", "passthrough", numeric_feature_indices),
(
"continuous",
impute.SimpleImputer(strategy="median"),
numeric_feature_indices,
),
]
),
),
Expand All @@ -182,7 +172,9 @@
task = openml.tasks.get_task(6)

# The following lines can then be executed offline:
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)
run = openml.runs.run_model_on_task(
pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array",
)

# The run may be stored offline, and the flow will be stored along with it:
run.to_filesystem(directory="myrun")
Expand Down
9 changes: 3 additions & 6 deletions examples/30_extended/run_setup_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,9 @@
# easy as you want it to be


cat_imp = make_pipeline(
SimpleImputer(strategy="most_frequent"),
OneHotEncoder(handle_unknown="ignore", sparse=False),
TruncatedSVD(),
)
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD(),)
cont_imp = SimpleImputer(strategy="median")
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),])

# Let's change some hyperparameters. Of course, in any good application we
Expand Down
10 changes: 3 additions & 7 deletions examples/40_paper/2018_neurips_perrone_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"):
cat_cols = list_categorical_attributes(flow_type=flow_type)
num_cols = list(set(X.columns) - set(cat_cols))

# Missing value imputers
cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None")
# Missing value imputer for numeric columns
num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)

# Creating the one-hot encoder
# Creating the one-hot encoder for numerical representation of categorical columns
enc = OneHotEncoder(handle_unknown="ignore")

# Pipeline to handle categorical column transformations
cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)])

# Combining column transformers
ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)])
ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])

# Creating the full pipeline with the surrogate model
clf = RandomForestRegressor(n_estimators=50)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ def test_serialize_model(self):
if LooseVersion(sklearn.__version__) >= "0.22":
fixture_parameters.update({"ccp_alpha": "0.0"})
fixture_parameters.move_to_end("ccp_alpha", last=False)
if LooseVersion(sklearn.__version__) >= "0.24":
del fixture_parameters["presort"]

structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []}

Expand Down Expand Up @@ -1317,12 +1319,18 @@ def test__get_fn_arguments_with_defaults(self):
(sklearn.tree.DecisionTreeClassifier.__init__, 14),
(sklearn.pipeline.Pipeline.__init__, 2),
]
else:
elif sklearn_version < "0.24":
fns = [
(sklearn.ensemble.RandomForestRegressor.__init__, 18),
(sklearn.tree.DecisionTreeClassifier.__init__, 14),
(sklearn.pipeline.Pipeline.__init__, 2),
]
else:
fns = [
(sklearn.ensemble.RandomForestRegressor.__init__, 18),
(sklearn.tree.DecisionTreeClassifier.__init__, 13),
(sklearn.pipeline.Pipeline.__init__, 2),
]

for fn, num_params_with_defaults in fns:
defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn)
Expand Down Expand Up @@ -1523,7 +1531,7 @@ def test_obtain_parameter_values(self):
"bootstrap": [True, False],
"criterion": ["gini", "entropy"],
},
cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1),
cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1, shuffle=True),
n_iter=5,
)
flow = self.extension.model_to_flow(model)
Expand Down
12 changes: 10 additions & 2 deletions tests/test_flows/test_flow_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,8 +325,16 @@ def test_get_flow_reinstantiate_model_wrong_version(self):
# Note that CI does not test against 0.19.1.
openml.config.server = self.production_server
_, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3]
flow = 8175
expected = "Trying to deserialize a model with dependency" " sklearn==0.19.1 not satisfied."
if sklearn_major > 23:
flow = 18587 # 18687, 18725 --- flows building random forest on >= 0.23
flow_sklearn_version = "0.23.1"
else:
flow = 8175
flow_sklearn_version = "0.19.1"
expected = (
"Trying to deserialize a model with dependency "
"sklearn=={} not satisfied.".format(flow_sklearn_version)
)
self.assertRaisesRegex(
ValueError, expected, openml.flows.get_flow, flow_id=flow, reinstantiate=True
)
Expand Down
13 changes: 6 additions & 7 deletions tests/test_study/test_study_examples.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# License: BSD 3-Clause

from openml.testing import TestBase, SimpleImputer, CustomImputer
from openml.testing import TestBase
from openml.extensions.sklearn import cat, cont

import sklearn
Expand All @@ -13,8 +13,8 @@ class TestStudyFunctions(TestBase):
"""Test the example code of Bischl et al. (2018)"""

@unittest.skipIf(
LooseVersion(sklearn.__version__) < "0.20",
reason="columntransformer introduction in 0.20.0",
LooseVersion(sklearn.__version__) < "0.24",
reason="columntransformer introduction in 0.24.0",
)
def test_Figure1a(self):
"""Test listing in Figure 1a on a single task and the old OpenML100 study.
Expand All @@ -39,15 +39,14 @@ def test_Figure1a(self):
import openml
import sklearn.metrics
import sklearn.tree
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite
cat_imp = make_pipeline(
SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
)
cont_imp = make_pipeline(CustomImputer(), StandardScaler())
cat_imp = OneHotEncoder(handle_unknown="ignore")
cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
clf = Pipeline(
steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]
Expand Down

0 comments on commit d2945ba

Please sign in to comment.