From b3b78672db0286eda770643ee10a28801e18dd31 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 7 Jul 2020 22:13:08 +0200 Subject: [PATCH 01/36] Create first section: Creating Custom Flow --- examples/30_extended/custom_flow_tutorial.py | 96 ++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 examples/30_extended/custom_flow_tutorial.py diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py new file mode 100644 index 000000000..c72cd949b --- /dev/null +++ b/examples/30_extended/custom_flow_tutorial.py @@ -0,0 +1,96 @@ +""" +================================ +Creating and Using a Custom Flow +================================ + +The most convenient way to create a flow for your machine learning workflow is to generate it +automatically as described in <>. However, there are scenarios where this is not possible, such +as when the flow uses a framework without an extension or when the flow is described by a script. + +In those cases you can still create a custom flow by following the steps of this tutorial. +As an example we will use the flows generated for the AutoML Benchmark (...), +and also show how to link runs to the custom flow. +""" + +#################################################################################################### + +# License: BSD 3-Clause +# .. warning:: This example uploads data. For that reason, this example +# connects to the test server at test.openml.org. This prevents the main +# server from crowding with example datasets, tasks, runs, and so on. +from collections import OrderedDict + +import openml + +openml.config.start_using_configuration_for_example() + +#################################################################################################### +# 1. Defining the flow +# ==================== +# The first step is to define all the hyperparameters of your flow. +# Check ... for the descriptions of each variable. +# Note that `external version` and `name` together should uniquely identify a flow. +# +# The AutoML Benchmark runs AutoML systems across a range of tasks. +# We can not use the flows of the AutoML systems directly, as the benchmark adds performs +# preprocessing as required. +# +# We will break down the flow parameters into several groups, for the tutorial. +# First we will define the name and version information. +# Make sure to leave enough information so others can determine exactly which +# version of the package/script is used. Use tags so users can find your flow easily. + +general = dict( + name="automlbenchmark_autosklearn", + description=( + "Auto-sklearn as set up by the AutoML Benchmark" + "Source: https://github.com/openml/automlbenchmark/releases/tag/v0.9" + ), + external_version="amlb==0.9", + language="English", + tags=["amlb", "benchmark", "study_218"], + dependencies="amlb==0.9", +) + +#################################################################################################### +# Next we define the flow hyperparameters. We define their name and default value in `parameters`, +# and provide meta-data for each parameter through `parameters_meta_info`. +# Note that the use of ordered dicts is required. 
+ +flow_hyperparameters = dict( + parameters=OrderedDict(time="240", memory="32", cores="8"), + parameters_meta_info=OrderedDict( + cores=OrderedDict(description="number of available cores", data_type="int"), + memory=OrderedDict(description="memory in gigabytes", data_type="int"), + time=OrderedDict(description="time in minutes", data_type="int"), + ), +) + +#################################################################################################### +# It is possible for flows to contain subflows. In this example, the auto-sklearn flow is a +# subflow, this means that the subflow is entirely executed as part of this flow. +# Using this modularity also allows your runs to specify which hyperparameters of the +# subflows were used! +# +# Note: flow 15275 is not actually the right flow on the test server, +# but that does not matter for this demonstration. + +autosklearn_flow = openml.flows.get_flow(15275) # auto-sklearn 0.5.1 +subflow = dict(components=OrderedDict(automl_tool=autosklearn_flow),) + +#################################################################################################### +# With all parameters of the flow defined, we can now initialize the OpenMLFlow and publish. +# Explicitly set the model of the flow to `None`, because we provided all the details already! + +autosklearn_amlb_flow = openml.flows.OpenMLFlow( + **general, **flow_hyperparameters, **subflow, model=None, +) +autosklearn_amlb_flow.publish() +print(f"autosklearn flow created: {autosklearn_amlb_flow.flow_id}") +# for dev purposes, since we're rerunning this often, we want to double-check no new flows are created + +#################################################################################################### +# 2. Using the flow +# ==================== +# This Section will show how to upload run data for your custom flow. +# From 19d79d7668ec9ad053a7f6f67689ffce99326667 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 7 Jul 2020 22:27:35 +0200 Subject: [PATCH 02/36] Add Section: Using the Flow It is incomplete as while trying to explain how to format the predictions, I realized a utility function is required. --- examples/30_extended/custom_flow_tutorial.py | 41 +++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index c72cd949b..ca4ca9f8c 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -87,10 +87,47 @@ ) autosklearn_amlb_flow.publish() print(f"autosklearn flow created: {autosklearn_amlb_flow.flow_id}") -# for dev purposes, since we're rerunning this often, we want to double-check no new flows are created #################################################################################################### # 2. Using the flow # ==================== # This Section will show how to upload run data for your custom flow. -# +# Take care to change the values of parameters as well as the task id, +# to reflect the actual run. +# Task and parameter values in the example are fictional. 
+ +flow_id = autosklearn_amlb_flow.flow_id + +parameters = [ + OrderedDict([("oml:name", "cores"), ("oml:value", 4), ("oml:component", flow_id)]), + OrderedDict([("oml:name", "memory"), ("oml:value", 16), ("oml:component", flow_id)]), + OrderedDict([("oml:name", "time"), ("oml:value", 120), ("oml:component", flow_id)]), +] + +task_id = 115 +task = openml.tasks.get_task(task_id) # Diabetes Task +dataset_id = task.get_dataset().dataset_id + + +#################################################################################################### +# The last bit of information for the run we need are the predicted values. +# The exact format of the predictions will depend on the task. +# [... add later, this clearly seems too complicated to expected users to do] + +predictions = [] # load_format_predictions(task_id, predictions) + +#################################################################################################### +# Finally we can create the OpenMLRun object and upload. +# We use the "setup string" because the used flow was a script. + +benchmark_command = f"python3 runbenchmark.py auto-sklearn medium -m aws -t 119" +my_run = openml.runs.OpenMLRun( + task_id=task_id, + flow_id=flow_id, + dataset_id=dataset_id, + parameter_settings=parameters, + setup_string=benchmark_command, + data_content=predictions, + tags=["study_218"], +) +my_run.publish() From 208f6cd65ee05556080bc7237f5cf1f2f9005be1 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Jul 2020 10:53:37 +0200 Subject: [PATCH 03/36] Allow run description text to be custom Previously the description text that accompanies the prediction file was auto-generated with the assumption that the corresponding flow had an extension. To support custom flows (with no extension), this behavior had to be changed. The description can now be passed on initialization. The description describing it was auto generated from run_task is now correctly only added if the run was generated through run_flow_on_task. --- openml/runs/functions.py | 60 +++++++++++++++++++++++++++++++++++++++- openml/runs/run.py | 15 +++++----- 2 files changed, 66 insertions(+), 9 deletions(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index b3b15d16e..ba67e1a8c 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -4,6 +4,7 @@ import io import itertools import os +import time from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING # noqa F401 import warnings @@ -250,7 +251,8 @@ def run_flow_on_task( ) data_content, trace, fold_evaluations, sample_evaluations = res - + fields = [*run_environment, time.strftime("%c"), "Created by run_flow_on_task"] + generated_description = "\n".join(fields) run = OpenMLRun( task_id=task.task_id, flow_id=flow_id, @@ -262,6 +264,7 @@ def run_flow_on_task( data_content=data_content, flow=flow, setup_string=flow.extension.create_setup_string(flow.model), + description_text=generated_description, ) if (upload_flow or avoid_duplicate_runs) and flow.flow_id is not None: @@ -1004,3 +1007,58 @@ def __list_runs(api_call, output_format="dict"): runs = pd.DataFrame.from_dict(runs, orient="index") return runs + + +def format_prediction( + task: OpenMLSupervisedTask, + repeat: int, + fold: int, + index: int, + prediction: Union[str, int, float], + truth: Union[str, int, float], + sample: Optional[int] = None, + proba: Optional[Dict[str, float]] = None, +) -> List[Union[str, int, float]]: + """ Format the predictions in the specific order as required for the run results. 
+ + Parameters + ---------- + task: OpenMLSupervisedTask + Task for which to format the predictions. + repeat: int + From which repeat this predictions is made. + fold: int + From which fold this prediction is made. + index: int + For which index this prediction is made. + prediction: str, int or float + The predicted class label or value. + truth: str, int or float + The true class label or value. + sample: int, optional (default=None) + From which sample set this prediction is made. + Required only for LearningCurve tasks. + proba: Dict[str, float], optional (default=None) + For classification tasks only. + A mapping from each class label to their predicted probability. + The dictionary should contain an entry for each of the `task.class_labels`. + E.g.: {"Iris-Setosa": 0.2, "Iris-Versicolor": 0.7, "Iris-Virginica": 0.1} + + Returns + ------- + A list with elements for the prediction results of a run. + + """ + if isinstance(task, OpenMLClassificationTask): + if proba is None: + raise ValueError("Predicted Class Probabilities are required for classification task") + if not set(task.class_labels) == set(proba): + raise ValueError("Each class should have a predicted probability") + if sample is None: + if isinstance(task, OpenMLLearningCurveTask): + raise ValueError("`sample` can not be none for LearningCurveTask") + else: + sample = 0 + probabilities = [proba[c] for c in task.class_labels] + return [repeat, fold, sample, index, *probabilities, truth, prediction] + return [repeat, fold, index, truth, prediction] diff --git a/openml/runs/run.py b/openml/runs/run.py index a61fc4688..a32907156 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -62,6 +62,7 @@ def __init__( task=None, flow=None, run_id=None, + description_text=None, ): self.uploader = uploader self.uploader_name = uploader_name @@ -87,6 +88,7 @@ def __init__( self.model = model self.tags = tags self.predictions_url = predictions_url + self.description_text = description_text @property def id(self) -> Optional[int]: @@ -264,16 +266,13 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]": if self.flow is None: self.flow = get_flow(self.flow_id) - run_environment = ( - self.flow.extension.get_version_information() - + [time.strftime("%c")] - + ["Created by run_task()"] - ) + if self.description_text is None: + self.description_text = time.strftime("%c") task = get_task(self.task_id) arff_dict = OrderedDict() # type: 'OrderedDict[str, Any]' arff_dict["data"] = self.data_content - arff_dict["description"] = "\n".join(run_environment) + arff_dict["description"] = self.description_text arff_dict["relation"] = "openml_task_{}_predictions".format(task.task_id) if isinstance(task, OpenMLLearningCurveTask): @@ -485,9 +484,9 @@ def _get_file_elements(self) -> Dict: Derived child classes should overwrite this method as necessary. The description field will be populated automatically if not provided. """ - if self.model is None: + if self.parameter_settings is None and self.model is None: raise PyOpenMLError( - "OpenMLRun obj does not contain a model. " "(This should never happen.) " + "OpenMLRun must contain a model or be initialized with parameter_settings." 
) if self.flow_id is None: if self.flow is None: From 2247bbc2ac20e46b2e31e7664bf706ec21b56246 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Jul 2020 10:57:16 +0200 Subject: [PATCH 04/36] Draft for Custom Flow tutorial --- examples/30_extended/custom_flow_tutorial.py | 58 ++++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index ca4ca9f8c..33aaaf1a1 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -19,8 +19,11 @@ # connects to the test server at test.openml.org. This prevents the main # server from crowding with example datasets, tasks, runs, and so on. from collections import OrderedDict +import numpy as np import openml +from openml import OpenMLClassificationTask +from openml.runs.functions import format_prediction openml.config.start_using_configuration_for_example() @@ -104,17 +107,62 @@ OrderedDict([("oml:name", "time"), ("oml:value", 120), ("oml:component", flow_id)]), ] -task_id = 115 -task = openml.tasks.get_task(task_id) # Diabetes Task +task_id = 1408 # Iris Task +task = openml.tasks.get_task(task_id) dataset_id = task.get_dataset().dataset_id #################################################################################################### # The last bit of information for the run we need are the predicted values. # The exact format of the predictions will depend on the task. -# [... add later, this clearly seems too complicated to expected users to do] +# The predictions should always be a list of lists, each list should contain: +# - the repeat number: for repeated evaluation strategies. (e.g. repeated cross-validation) +# - the fold number: for cross-validation. (what should this be for holdout?) +# - 0: this field is for backward compatibility. +# - index: the row (of the original dataset) for which the prediction was made. +# - p_1, ..., p_c: for each class the predicted probability of the sample +# belonging to that class. (no elements for regression tasks) +# Make sure the order of these elements follows the order of `task.class_labels`. +# - the predicted class/value for the sample +# - the true class/value for the sample +# +# Here we generated some random predictions in place. +# You can ignore this code, or use it to better understand the formatting of the predictions. 
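# For instance, a single prediction row for this Iris task (three classes) could look
# as follows; the values are purely illustrative:
example_row = [0, 2, 0, 140, 0.2, 0.7, 0.1, "Iris-versicolor", "Iris-versicolor"]
# i.e. repeat 0, fold 2, the backwards-compatibility 0, row 140 of the dataset,
# the predicted probability of each class (ordered as in `task.class_labels`),
# and finally the predicted label and the true label.
# A row for a regression task would simply omit the probability entries.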
+# Find the repeats/folds/samples for this task: +n_repeats, n_folds, _ = task.get_split_dimensions() +all_test_indices = [ + (repeat, fold, 0, index) + for repeat in range(n_repeats) + for fold in range(n_folds) + for index in task.get_train_test_split_indices(fold, repeat)[1] +] -predictions = [] # load_format_predictions(task_id, predictions) +# random class probabilities (Iris has 150 samples and 3 classes): +r = np.random.rand(150 * n_repeats, 3) +# scale the random values so that the probabilities of each sample sum to 1: +y_proba = r / r.sum(axis=1).reshape(-1, 1) +y_pred = y_proba.argmax(axis=1) +class_map = dict(zip(range(3), task.class_labels)) +y_true = ["Iris-setosa"] * 50 + ["Iris-versicolor"] * 50 + ["Iris-virginica"] * 50 + +predictions = [] +ps = [] + +for where, y, yp, proba in zip(all_test_indices, y_true, y_pred, y_proba): + predictions.append([*where, *proba, class_map[yp], y]) + repeat, fold, sample, index = where + + p = format_prediction( + task=task, + repeat=repeat, + fold=fold, + sample=sample, + index=index, + prediction=class_map[yp], + truth=y, + proba={c: pb for (c, pb) in zip(task.class_labels, proba)}, + ) + ps.append(p) #################################################################################################### # Finally we can create the OpenMLRun object and upload. @@ -129,5 +177,7 @@ setup_string=benchmark_command, data_content=predictions, tags=["study_218"], + description_text="Run generated by the Custom Flow tutorial.", ) my_run.publish() +print("run created:", my_run.run_id) From 326510ccc0afe30d200ccbae61ce0f93460c0009 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Jul 2020 17:54:54 +0200 Subject: [PATCH 05/36] Add minimal docstring to OpenMLRun I am not for each field what the specifications are. --- openml/runs/run.py | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/openml/runs/run.py b/openml/runs/run.py index a32907156..b8be9c3a3 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -27,14 +27,37 @@ class OpenMLRun(OpenMLBase): """OpenML Run: result of running a model on an openml dataset. - Parameters - ---------- - task_id : int - Refers to the task. - flow_id : int - Refers to the flow. - dataset_id: int - Refers to the data. + Parameters + ---------- + task_id: int + flow_id: int + dataset_id: int + setup_string: str + output_files: Dict[str, str] + A dictionary that specifies where each related file can be found. + setup_id: int + tags: List[str] + uploader: int + User ID of the uploader. + uploader_name: str + evaluations: Dict + fold_evaluations: Dict + sample_evaluations: Dict + data_content: List[List] + The predictions generated from executing this run. + trace: OpenMLRunTrace + model: object + task_type: str + task_evaluation_measure: str + flow_name: str + parameter_settings: List[OrderedDict] + predictions_url: str + task: OpenMLTask + flow: OpenMLFlow + run_id: int + description_text: str, optional + Description text to add to the predictions file. 
+ If left None, """ def __init__( From 872bd7596788264990bf5b0665e00666f0e91bd1 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Jul 2020 18:11:54 +0200 Subject: [PATCH 06/36] Process code review feedback In particular: - text changes - fetch true labels from the dataset instead --- examples/30_extended/custom_flow_tutorial.py | 40 ++++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index 33aaaf1a1..cb498a575 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -32,13 +32,13 @@ # ==================== # The first step is to define all the hyperparameters of your flow. # Check ... for the descriptions of each variable. -# Note that `external version` and `name` together should uniquely identify a flow. +# Note that `external version` and `name` together uniquely identify a flow. # # The AutoML Benchmark runs AutoML systems across a range of tasks. -# We can not use the flows of the AutoML systems directly, as the benchmark adds performs -# preprocessing as required. +# OpenML stores Flows for each AutoML system. However, the AutoML benchmark adds +# preprocessing to the flow, so should be described in a new flow. # -# We will break down the flow parameters into several groups, for the tutorial. +# We will break down the flow arguments into several groups, for the tutorial. # First we will define the name and version information. # Make sure to leave enough information so others can determine exactly which # version of the package/script is used. Use tags so users can find your flow easily. @@ -57,7 +57,7 @@ #################################################################################################### # Next we define the flow hyperparameters. We define their name and default value in `parameters`, -# and provide meta-data for each parameter through `parameters_meta_info`. +# and provide meta-data for each hyperparameter through `parameters_meta_info`. # Note that the use of ordered dicts is required. flow_hyperparameters = dict( @@ -74,6 +74,7 @@ # subflow, this means that the subflow is entirely executed as part of this flow. # Using this modularity also allows your runs to specify which hyperparameters of the # subflows were used! +# Using a subflow is not required. # # Note: flow 15275 is not actually the right flow on the test server, # but that does not matter for this demonstration. @@ -115,6 +116,7 @@ #################################################################################################### # The last bit of information for the run we need are the predicted values. # The exact format of the predictions will depend on the task. +# # The predictions should always be a list of lists, each list should contain: # - the repeat number: for repeated evaluation strategies. (e.g. repeated cross-validation) # - the fold number: for cross-validation. (what should this be for holdout?) @@ -126,12 +128,18 @@ # - the predicted class/value for the sample # - the true class/value for the sample # +# When using openml-python extensions (such as through `run_model_on_task`), +# all of this formatting is automatic. +# Unfortunately we can not automate this procedure for custom flows, +# which means a little additional effort is required. +# # Here we generated some random predictions in place. # You can ignore this code, or use it to better understand the formatting of the predictions. 
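# (For comparison: with an extension available, a single call such as
# `run = openml.runs.run_model_on_task(model, task)` would generate and format
# these predictions automatically; the manual steps below are only needed
# because our custom flow has no extension.)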
-# Find the repeats/folds/samples for this task: +# +# Find the repeats/folds for this task: n_repeats, n_folds, _ = task.get_split_dimensions() all_test_indices = [ - (repeat, fold, 0, index) + (repeat, fold, index) for repeat in range(n_repeats) for fold in range(n_folds) for index in task.get_train_test_split_indices(fold, repeat)[1] @@ -142,31 +150,31 @@ # scale the random values so that the probabilities of each sample sum to 1: y_proba = r / r.sum(axis=1).reshape(-1, 1) y_pred = y_proba.argmax(axis=1) + class_map = dict(zip(range(3), task.class_labels)) -y_true = ["Iris-setosa"] * 50 + ["Iris-versicolor"] * 50 + ["Iris-virginica"] * 50 +_, y_true = task.get_X_and_y() +y_true = [class_map[y] for y in y_true] +# We format the predictions with the utility function `format_prediction`. +# It will organize the relevant data in the expected format/order. predictions = [] -ps = [] - for where, y, yp, proba in zip(all_test_indices, y_true, y_pred, y_proba): - predictions.append([*where, *proba, class_map[yp], y]) - repeat, fold, sample, index = where + repeat, fold, index = where - p = format_prediction( + prediction = format_prediction( task=task, repeat=repeat, fold=fold, - sample=sample, index=index, prediction=class_map[yp], truth=y, proba={c: pb for (c, pb) in zip(task.class_labels, proba)}, ) - ps.append(p) + predictions.append(prediction) #################################################################################################### # Finally we can create the OpenMLRun object and upload. -# We use the "setup string" because the used flow was a script. +# We use the argument setup_string because the used flow was a script." benchmark_command = f"python3 runbenchmark.py auto-sklearn medium -m aws -t 119" my_run = openml.runs.OpenMLRun( From c3a53260421258c410d6c1bb6bce52bede3b7785 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Jul 2020 18:22:19 +0200 Subject: [PATCH 07/36] Use the format utility function in automatic runs To format the predictions. 
--- openml/runs/functions.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index ba67e1a8c..77b803c6c 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -481,13 +481,17 @@ def _calculate_local_measure(sklearn_fn, openml_name): for i, tst_idx in enumerate(test_indices): - arff_line = [rep_no, fold_no, sample_no, tst_idx] # type: List[Any] if task.class_labels is not None: - for j, class_label in enumerate(task.class_labels): - arff_line.append(proba_y[i][j]) - - arff_line.append(task.class_labels[pred_y[i]]) - arff_line.append(task.class_labels[test_y[i]]) + arff_line = format_prediction( + task=task, + repeat=rep_no, + fold=fold_no, + sample=sample_no, + index=tst_idx, + prediction=task.class_labels[pred_y[i]], + truth=task.class_labels[test_y[i]], + proba=dict(zip(task.class_labels, proba_y[i])), + ) else: raise ValueError("The task has no class labels") @@ -501,7 +505,15 @@ def _calculate_local_measure(sklearn_fn, openml_name): elif isinstance(task, OpenMLRegressionTask): for i in range(0, len(test_indices)): - arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y[i]] + arff_line = format_prediction( + task=task, + repeat=rep_no, + fold=fold_no, + index=test_indices[i], + prediction=pred_y[i], + truth=test_y[i], + ) + arff_datacontent.append(arff_line) if add_local_measures: From a7cb290ada4281917885a3e22316e39d4860a067 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 13 Jul 2020 13:35:54 +0200 Subject: [PATCH 08/36] Process @mfeurer feedback --- examples/30_extended/custom_flow_tutorial.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index cb498a575..70c4adfb8 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -58,7 +58,8 @@ #################################################################################################### # Next we define the flow hyperparameters. We define their name and default value in `parameters`, # and provide meta-data for each hyperparameter through `parameters_meta_info`. -# Note that the use of ordered dicts is required. +# Note that eventhough the argument name is `parameters` they describe the hyperparameters. +# The use of ordered dicts is required. flow_hyperparameters = dict( parameters=OrderedDict(time="240", memory="32", cores="8"), @@ -84,7 +85,13 @@ #################################################################################################### # With all parameters of the flow defined, we can now initialize the OpenMLFlow and publish. -# Explicitly set the model of the flow to `None`, because we provided all the details already! +# Because we provided all the details already, we do not need to provide a `model` to the flow. +# +# In our case, we don't even have a model. It is possible to have a model but still require +# to follow these steps when the model (python object) does not have an extensions from which +# to automatically extract the hyperparameters. +# So whether you have a model with no extension or no model at all, explicitly set +# the model of the flow to `None`. 
autosklearn_amlb_flow = openml.flows.OpenMLFlow( **general, **flow_hyperparameters, **subflow, model=None, @@ -174,7 +181,7 @@ #################################################################################################### # Finally we can create the OpenMLRun object and upload. -# We use the argument setup_string because the used flow was a script." +# We use the argument setup_string because the used flow was a script. benchmark_command = f"python3 runbenchmark.py auto-sklearn medium -m aws -t 119" my_run = openml.runs.OpenMLRun( From e5dcaf01b100f7712a00a6f254e2ee7737930002 Mon Sep 17 00:00:00 2001 From: Bilgecelik <38037323+Bilgecelik@users.noreply.github.com> Date: Tue, 14 Jul 2020 12:20:07 +0200 Subject: [PATCH 09/36] Rename arguments of list_evaluations (#933) * list evals name change * list evals - update --- .../30_extended/fetch_evaluations_tutorial.py | 6 +- .../plot_svm_hyperparameters_tutorial.py | 4 +- examples/40_paper/2018_ida_strang_example.py | 2 +- examples/40_paper/2018_kdd_rijn_example.py | 4 +- .../40_paper/2018_neurips_perrone_example.py | 6 +- openml/evaluations/functions.py | 138 +++++++++--------- .../test_evaluation_functions.py | 32 ++-- .../test_evaluations_example.py | 4 +- 8 files changed, 100 insertions(+), 96 deletions(-) diff --git a/examples/30_extended/fetch_evaluations_tutorial.py b/examples/30_extended/fetch_evaluations_tutorial.py index de636e074..2823eabf3 100644 --- a/examples/30_extended/fetch_evaluations_tutorial.py +++ b/examples/30_extended/fetch_evaluations_tutorial.py @@ -63,7 +63,7 @@ metric = "predictive_accuracy" evals = openml.evaluations.list_evaluations( - function=metric, task=[task_id], output_format="dataframe" + function=metric, tasks=[task_id], output_format="dataframe" ) # Displaying the first 10 rows print(evals.head(n=10)) @@ -162,7 +162,7 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"): # List evaluations in descending order based on predictive_accuracy with # hyperparameters evals_setups = openml.evaluations.list_evaluations_setups( - function="predictive_accuracy", task=[31], size=100, sort_order="desc" + function="predictive_accuracy", tasks=[31], size=100, sort_order="desc" ) "" @@ -173,7 +173,7 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"): # with hyperparameters. parameters_in_separate_columns returns parameters in # separate columns evals_setups = openml.evaluations.list_evaluations_setups( - function="predictive_accuracy", flow=[6767], size=100, parameters_in_separate_columns=True + function="predictive_accuracy", flows=[6767], size=100, parameters_in_separate_columns=True ) "" diff --git a/examples/30_extended/plot_svm_hyperparameters_tutorial.py b/examples/30_extended/plot_svm_hyperparameters_tutorial.py index aac84bcd4..e366c56df 100644 --- a/examples/30_extended/plot_svm_hyperparameters_tutorial.py +++ b/examples/30_extended/plot_svm_hyperparameters_tutorial.py @@ -20,8 +20,8 @@ # uploaded runs (called *setup*). df = openml.evaluations.list_evaluations_setups( function="predictive_accuracy", - flow=[8353], - task=[6], + flows=[8353], + tasks=[6], output_format="dataframe", # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise, # the dataframe would contain a field ``paramaters`` containing an unparsed dictionary. 
diff --git a/examples/40_paper/2018_ida_strang_example.py b/examples/40_paper/2018_ida_strang_example.py index 74c6fde5f..687d973c2 100644 --- a/examples/40_paper/2018_ida_strang_example.py +++ b/examples/40_paper/2018_ida_strang_example.py @@ -47,7 +47,7 @@ # Downloads all evaluation records related to this study evaluations = openml.evaluations.list_evaluations( - measure, flow=flow_ids, study=study_id, output_format="dataframe" + measure, flows=flow_ids, study=study_id, output_format="dataframe" ) # gives us a table with columns data_id, flow1_value, flow2_value evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna() diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py index e5d998e35..752419ea3 100644 --- a/examples/40_paper/2018_kdd_rijn_example.py +++ b/examples/40_paper/2018_kdd_rijn_example.py @@ -88,8 +88,8 @@ # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) evals = openml.evaluations.list_evaluations_setups( evaluation_measure, - flow=[flow_id], - task=[task_id], + flows=[flow_id], + tasks=[task_id], size=limit_per_task, output_format="dataframe", ) diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py index 8639e0a3a..60d212116 100644 --- a/examples/40_paper/2018_neurips_perrone_example.py +++ b/examples/40_paper/2018_neurips_perrone_example.py @@ -91,9 +91,9 @@ def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_cu # Fetching evaluations eval_df = openml.evaluations.list_evaluations_setups( function=metric, - task=task_ids, - flow=[flow_id], - uploader=[2702], + tasks=task_ids, + flows=[flow_id], + uploaders=[2702], output_format="dataframe", parameters_in_separate_columns=True, ) diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index adaf419ef..4c17f8ce7 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -17,11 +17,11 @@ def list_evaluations( function: str, offset: Optional[int] = None, size: Optional[int] = None, - task: Optional[List] = None, - setup: Optional[List] = None, - flow: Optional[List] = None, - run: Optional[List] = None, - uploader: Optional[List] = None, + tasks: Optional[List[Union[str, int]]] = None, + setups: Optional[List[Union[str, int]]] = None, + flows: Optional[List[Union[str, int]]] = None, + runs: Optional[List[Union[str, int]]] = None, + uploaders: Optional[List[Union[str, int]]] = None, tag: Optional[str] = None, study: Optional[int] = None, per_fold: Optional[bool] = None, @@ -41,17 +41,18 @@ def list_evaluations( size : int, optional the maximum number of runs to show - task : list, optional - - setup: list, optional - - flow : list, optional - - run : list, optional - - uploader : list, optional - + tasks : list[int,str], optional + the list of task IDs + setups: list[int,str], optional + the list of setup IDs + flows : list[int,str], optional + the list of flow IDs + runs :list[int,str], optional + the list of run IDs + uploaders : list[int,str], optional + the list of uploader IDs tag : str, optional + filter evaluation based on given tag study : int, optional @@ -85,11 +86,11 @@ def list_evaluations( function=function, offset=offset, size=size, - task=task, - setup=setup, - flow=flow, - run=run, - uploader=uploader, + tasks=tasks, + setups=setups, + flows=flows, + runs=runs, + uploaders=uploaders, tag=tag, study=study, sort_order=sort_order, @@ 
-99,11 +100,11 @@ def list_evaluations( def _list_evaluations( function: str, - task: Optional[List] = None, - setup: Optional[List] = None, - flow: Optional[List] = None, - run: Optional[List] = None, - uploader: Optional[List] = None, + tasks: Optional[List] = None, + setups: Optional[List] = None, + flows: Optional[List] = None, + runs: Optional[List] = None, + uploaders: Optional[List] = None, study: Optional[int] = None, sort_order: Optional[str] = None, output_format: str = "object", @@ -120,15 +121,16 @@ def _list_evaluations( function : str the evaluation function. e.g., predictive_accuracy - task : list, optional - - setup: list, optional - - flow : list, optional - - run : list, optional - - uploader : list, optional + tasks : list[int,str], optional + the list of task IDs + setups: list[int,str], optional + the list of setup IDs + flows : list[int,str], optional + the list of flow IDs + runs :list[int,str], optional + the list of run IDs + uploaders : list[int,str], optional + the list of uploader IDs study : int, optional @@ -155,16 +157,16 @@ def _list_evaluations( if kwargs is not None: for operator, value in kwargs.items(): api_call += "/%s/%s" % (operator, value) - if task is not None: - api_call += "/task/%s" % ",".join([str(int(i)) for i in task]) - if setup is not None: - api_call += "/setup/%s" % ",".join([str(int(i)) for i in setup]) - if flow is not None: - api_call += "/flow/%s" % ",".join([str(int(i)) for i in flow]) - if run is not None: - api_call += "/run/%s" % ",".join([str(int(i)) for i in run]) - if uploader is not None: - api_call += "/uploader/%s" % ",".join([str(int(i)) for i in uploader]) + if tasks is not None: + api_call += "/task/%s" % ",".join([str(int(i)) for i in tasks]) + if setups is not None: + api_call += "/setup/%s" % ",".join([str(int(i)) for i in setups]) + if flows is not None: + api_call += "/flow/%s" % ",".join([str(int(i)) for i in flows]) + if runs is not None: + api_call += "/run/%s" % ",".join([str(int(i)) for i in runs]) + if uploaders is not None: + api_call += "/uploader/%s" % ",".join([str(int(i)) for i in uploaders]) if study is not None: api_call += "/study/%d" % study if sort_order is not None: @@ -276,11 +278,11 @@ def list_evaluations_setups( function: str, offset: Optional[int] = None, size: Optional[int] = None, - task: Optional[List] = None, - setup: Optional[List] = None, - flow: Optional[List] = None, - run: Optional[List] = None, - uploader: Optional[List] = None, + tasks: Optional[List] = None, + setups: Optional[List] = None, + flows: Optional[List] = None, + runs: Optional[List] = None, + uploaders: Optional[List] = None, tag: Optional[str] = None, per_fold: Optional[bool] = None, sort_order: Optional[str] = None, @@ -299,15 +301,15 @@ def list_evaluations_setups( the number of runs to skip, starting from the first size : int, optional the maximum number of runs to show - task : list[int], optional + tasks : list[int], optional the list of task IDs - setup: list[int], optional + setups: list[int], optional the list of setup IDs - flow : list[int], optional + flows : list[int], optional the list of flow IDs - run : list[int], optional + runs : list[int], optional the list of run IDs - uploader : list[int], optional + uploaders : list[int], optional the list of uploader IDs tag : str, optional filter evaluation based on given tag @@ -327,7 +329,7 @@ def list_evaluations_setups( ------- dict or dataframe with hyperparameter settings as a list of tuples. 
""" - if parameters_in_separate_columns and (flow is None or len(flow) != 1): + if parameters_in_separate_columns and (flows is None or len(flows) != 1): raise ValueError( "Can set parameters_in_separate_columns to true " "only for single flow_id" ) @@ -337,11 +339,11 @@ def list_evaluations_setups( function=function, offset=offset, size=size, - run=run, - task=task, - setup=setup, - flow=flow, - uploader=uploader, + runs=runs, + tasks=tasks, + setups=setups, + flows=flows, + uploaders=uploaders, tag=tag, per_fold=per_fold, sort_order=sort_order, @@ -359,24 +361,26 @@ def list_evaluations_setups( setup_chunks = np.array_split( ary=evals["setup_id"].unique(), indices_or_sections=((length - 1) // N) + 1 ) - setups = pd.DataFrame() - for setup in setup_chunks: - result = pd.DataFrame(openml.setups.list_setups(setup=setup, output_format="dataframe")) + setup_data = pd.DataFrame() + for setups in setup_chunks: + result = pd.DataFrame( + openml.setups.list_setups(setup=setups, output_format="dataframe") + ) result.drop("flow_id", axis=1, inplace=True) # concat resulting setup chunks into single datframe - setups = pd.concat([setups, result], ignore_index=True) + setup_data = pd.concat([setup_data, result], ignore_index=True) parameters = [] # Convert parameters of setup into list of tuples of (hyperparameter, value) - for parameter_dict in setups["parameters"]: + for parameter_dict in setup_data["parameters"]: if parameter_dict is not None: parameters.append( {param["full_name"]: param["value"] for param in parameter_dict.values()} ) else: parameters.append({}) - setups["parameters"] = parameters + setup_data["parameters"] = parameters # Merge setups with evaluations - df = pd.merge(evals, setups, on="setup_id", how="left") + df = pd.merge(evals, setup_data, on="setup_id", how="left") if parameters_in_separate_columns: df = pd.concat([df.drop("parameters", axis=1), df["parameters"].apply(pd.Series)], axis=1) diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index 6fcaea2d4..0127309a7 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -41,7 +41,7 @@ def test_evaluation_list_filter_task(self): task_id = 7312 - evaluations = openml.evaluations.list_evaluations("predictive_accuracy", task=[task_id]) + evaluations = openml.evaluations.list_evaluations("predictive_accuracy", tasks=[task_id]) self.assertGreater(len(evaluations), 100) for run_id in evaluations.keys(): @@ -56,7 +56,7 @@ def test_evaluation_list_filter_uploader_ID_16(self): uploader_id = 16 evaluations = openml.evaluations.list_evaluations( - "predictive_accuracy", uploader=[uploader_id], output_format="dataframe" + "predictive_accuracy", uploaders=[uploader_id], output_format="dataframe" ) self.assertEqual(evaluations["uploader"].unique(), [uploader_id]) @@ -66,7 +66,7 @@ def test_evaluation_list_filter_uploader_ID_10(self): openml.config.server = self.production_server setup_id = 10 - evaluations = openml.evaluations.list_evaluations("predictive_accuracy", setup=[setup_id]) + evaluations = openml.evaluations.list_evaluations("predictive_accuracy", setups=[setup_id]) self.assertGreater(len(evaluations), 50) for run_id in evaluations.keys(): @@ -81,7 +81,7 @@ def test_evaluation_list_filter_flow(self): flow_id = 100 - evaluations = openml.evaluations.list_evaluations("predictive_accuracy", flow=[flow_id]) + evaluations = openml.evaluations.list_evaluations("predictive_accuracy", flows=[flow_id]) 
self.assertGreater(len(evaluations), 2) for run_id in evaluations.keys(): @@ -96,7 +96,7 @@ def test_evaluation_list_filter_run(self): run_id = 12 - evaluations = openml.evaluations.list_evaluations("predictive_accuracy", run=[run_id]) + evaluations = openml.evaluations.list_evaluations("predictive_accuracy", runs=[run_id]) self.assertEqual(len(evaluations), 1) for run_id in evaluations.keys(): @@ -132,9 +132,9 @@ def test_evaluation_list_per_fold(self): "predictive_accuracy", size=size, offset=0, - task=task_ids, - flow=flow_ids, - uploader=uploader_ids, + tasks=task_ids, + flows=flow_ids, + uploaders=uploader_ids, per_fold=True, ) @@ -149,9 +149,9 @@ def test_evaluation_list_per_fold(self): "predictive_accuracy", size=size, offset=0, - task=task_ids, - flow=flow_ids, - uploader=uploader_ids, + tasks=task_ids, + flows=flow_ids, + uploaders=uploader_ids, per_fold=False, ) for run_id in evaluations.keys(): @@ -164,11 +164,11 @@ def test_evaluation_list_sort(self): task_id = 6 # Get all evaluations of the task unsorted_eval = openml.evaluations.list_evaluations( - "predictive_accuracy", offset=0, task=[task_id] + "predictive_accuracy", offset=0, tasks=[task_id] ) # Get top 10 evaluations of the same task sorted_eval = openml.evaluations.list_evaluations( - "predictive_accuracy", size=size, offset=0, task=[task_id], sort_order="desc" + "predictive_accuracy", size=size, offset=0, tasks=[task_id], sort_order="desc" ) self.assertEqual(len(sorted_eval), size) self.assertGreater(len(unsorted_eval), 0) @@ -191,11 +191,11 @@ def test_list_evaluations_setups_filter_flow(self): openml.config.server = self.production_server flow_id = [405] size = 100 - evals = self._check_list_evaluation_setups(flow=flow_id, size=size) + evals = self._check_list_evaluation_setups(flows=flow_id, size=size) # check if parameters in separate columns works evals_cols = openml.evaluations.list_evaluations_setups( "predictive_accuracy", - flow=flow_id, + flows=flow_id, size=size, sort_order="desc", output_format="dataframe", @@ -209,4 +209,4 @@ def test_list_evaluations_setups_filter_task(self): openml.config.server = self.production_server task_id = [6] size = 121 - self._check_list_evaluation_setups(task=task_id, size=size) + self._check_list_evaluation_setups(tasks=task_id, size=size) diff --git a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py index 61b6c359e..5715b570a 100644 --- a/tests/test_evaluations/test_evaluations_example.py +++ b/tests/test_evaluations/test_evaluations_example.py @@ -14,8 +14,8 @@ def test_example_python_paper(self): df = openml.evaluations.list_evaluations_setups( "predictive_accuracy", - flow=[8353], - task=[6], + flows=[8353], + tasks=[6], output_format="dataframe", parameters_in_separate_columns=True, ) # Choose an SVM flow, for example 8353, and a task. From 16700507289c6eb3b9b2b664688eb817d2451b99 Mon Sep 17 00:00:00 2001 From: marcoslbueno <38478211+marcoslbueno@users.noreply.github.com> Date: Tue, 14 Jul 2020 12:21:08 +0200 Subject: [PATCH 10/36] adding config file to user guide (#931) * adding config file to user guide * finished requested changes --- doc/usage.rst | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/doc/usage.rst b/doc/usage.rst index 36c8584ff..d7ad0d523 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -29,6 +29,35 @@ the OpenML Python connector, followed up by a simple example. 
* `Introduction `_ +~~~~~~~~~~~~~ +Configuration +~~~~~~~~~~~~~ + +The configuration file resides in a directory ``.openml`` in the home +directory of the user and is called config. It consists of ``key = value`` pairs +which are separated by newlines. The following keys are defined: + +* apikey: + * required to access the server. The `OpenML setup `_ describes how to obtain an API key. + +* server: + * default: ``http://www.openml.org``. Alternatively, use ``test.openml.org`` for the test server. + +* cachedir: + * if not given, will default to ``~/.openml/cache`` + +* avoid_duplicate_runs: + * if set to ``True``, when ``run_flow_on_task`` or similar methods are called a lookup is performed to see if there already exists such a run on the server. If so, download those results instead. + * if not given, will default to ``True``. + +* connection_n_retries: + * number of connection retries. + * default: 2. Maximum number of retries: 20. + +* verbosity: + * 0: normal output + * 1: info output + * 2: debug output ~~~~~~~~~~~~ Key concepts From 9c93f5b06a9802ae283ccba9d36a5e426378494a Mon Sep 17 00:00:00 2001 From: Sahithya Ravi <44670788+sahithyaravi1493@users.noreply.github.com> Date: Thu, 23 Jul 2020 13:08:52 +0200 Subject: [PATCH 11/36] Edit api (#935) * version1 * minor fixes * tests * reformat code * check new version * remove get data * code format * review comments * fix duplicate * type annotate * example * tests for exceptions * fix pep8 * black format --- doc/progress.rst | 2 +- examples/30_extended/datasets_tutorial.py | 43 ++++- openml/datasets/functions.py | 148 ++++++++++++++++++ tests/test_datasets/test_dataset_functions.py | 81 +++++++++- 4 files changed, 269 insertions(+), 5 deletions(-) diff --git a/doc/progress.rst b/doc/progress.rst index 976c5c750..ef5ed6bae 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -8,7 +8,7 @@ Changelog 0.11.0 ~~~~~~ - +* ADD #929: Add data edit API * FIX #873: Fixes an issue which resulted in incorrect URLs when printing OpenML objects after switching the server. * FIX #885: Logger no longer registered by default. Added utility functions to easily register diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index d7971d0f1..40b35bbea 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -5,12 +5,13 @@ How to list and download datasets. """ -############################################################################ +"" # License: BSD 3-Clauses import openml import pandas as pd +from openml.datasets.functions import edit_dataset, get_dataset ############################################################################ # Exercise 0 @@ -42,9 +43,9 @@ # * Find a dataset called 'eeg_eye_state'. # * Find all datasets with more than 50 classes. datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20) -############################################################################ +"" datalist.query('name == "eeg-eye-state"') -############################################################################ +"" datalist.query("NumberOfClasses > 50") ############################################################################ @@ -108,3 +109,39 @@ alpha=0.8, cmap="plasma", ) + + +############################################################################ +# Edit a created dataset +# ================================================= +# This example uses the test server, to avoid editing a dataset on the main server. 
+openml.config.start_using_configuration_for_example() +############################################################################ +# Changes to these field edits existing version: allowed only for dataset owner +data_id = edit_dataset( + 564, + description="xor dataset represents XOR operation", + contributor="", + collection_date="2019-10-29 17:06:18", + original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor", + paper_url="", + citation="kaggle", + language="English", +) +edited_dataset = get_dataset(data_id) +print(f"Edited dataset ID: {data_id}") + + +############################################################################ +# Changes to these fields: attributes, default_target_attribute, +# row_id_attribute, ignore_attribute generates a new edited version: allowed for anyone + +new_attributes = [ + ("x0", "REAL"), + ("x1", "REAL"), + ("y", "REAL"), +] +data_id = edit_dataset(564, attributes=new_attributes) +print(f"Edited dataset ID: {data_id}") + +openml.config.stop_using_configuration_for_example() diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 79fa82867..4446f0e90 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -799,6 +799,154 @@ def status_update(data_id, status): raise ValueError("Data id/status does not collide") +def edit_dataset( + data_id, + description=None, + creator=None, + contributor=None, + collection_date=None, + language=None, + attributes=None, + data=None, + default_target_attribute=None, + ignore_attribute=None, + citation=None, + row_id_attribute=None, + original_data_url=None, + paper_url=None, +) -> int: + """ + Edits an OpenMLDataset. + Specify atleast one field to edit, apart from data_id + - For certain fields, a new dataset version is created : attributes, data, + default_target_attribute, ignore_attribute, row_id_attribute. + + - For other fields, the uploader can edit the exisiting version. + Noone except the uploader can edit the exisitng version. + + Parameters + ---------- + data_id : int + ID of the dataset. + description : str + Description of the dataset. + creator : str + The person who created the dataset. + contributor : str + People who contributed to the current version of the dataset. + collection_date : str + The date the data was originally collected, given by the uploader. + language : str + Language in which the data is represented. + Starts with 1 upper case letter, rest lower case, e.g. 'English'. + attributes : list, dict, or 'auto' + A list of tuples. Each tuple consists of the attribute name and type. + If passing a pandas DataFrame, the attributes can be automatically + inferred by passing ``'auto'``. Specific attributes can be manually + specified by a passing a dictionary where the key is the name of the + attribute and the value is the data type of the attribute. + data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features) + An array that contains both the attributes and the targets. When + providing a dataframe, the attribute names and type can be inferred by + passing ``attributes='auto'``. + The target feature is indicated as meta-data of the dataset. + default_target_attribute : str + The default target attribute, if it exists. + Can have multiple values, comma separated. + ignore_attribute : str | list + Attributes that should be excluded in modelling, + such as identifiers and indexes. + citation : str + Reference(s) that should be cited when building on this data. 
+ row_id_attribute : str, optional + The attribute that represents the row-id column, if present in the + dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not + specified, the index of the dataframe will be used as the + ``row_id_attribute``. If the name of the index is ``None``, it will + be discarded. + + .. versionadded: 0.8 + Inference of ``row_id_attribute`` from a dataframe. + original_data_url : str, optional + For derived data, the url to the original dataset. + paper_url : str, optional + Link to a paper describing the dataset. + + + Returns + ------- + data_id of the existing edited version or the new version created and published""" + if not isinstance(data_id, int): + raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) + + # case 1, changing these fields creates a new version of the dataset with changed field + if any( + field is not None + for field in [ + data, + attributes, + default_target_attribute, + row_id_attribute, + ignore_attribute, + ] + ): + logger.warning("Creating a new version of dataset, cannot edit existing version") + dataset = get_dataset(data_id) + + decoded_arff = dataset._get_arff(format="arff") + data_old = decoded_arff["data"] + data_new = data if data is not None else data_old + dataset_new = create_dataset( + name=dataset.name, + description=description or dataset.description, + creator=creator or dataset.creator, + contributor=contributor or dataset.contributor, + collection_date=collection_date or dataset.collection_date, + language=language or dataset.language, + licence=dataset.licence, + attributes=attributes or decoded_arff["attributes"], + data=data_new, + default_target_attribute=default_target_attribute or dataset.default_target_attribute, + ignore_attribute=ignore_attribute or dataset.ignore_attribute, + citation=citation or dataset.citation, + row_id_attribute=row_id_attribute or dataset.row_id_attribute, + original_data_url=original_data_url or dataset.original_data_url, + paper_url=paper_url or dataset.paper_url, + update_comment=dataset.update_comment, + version_label=dataset.version_label, + ) + dataset_new.publish() + return dataset_new.dataset_id + + # case 2, changing any of these fields will update existing dataset + # compose data edit parameters as xml + form_data = {"data_id": data_id} + xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' + xml["oml:data_edit_parameters"] = OrderedDict() + xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml" + xml["oml:data_edit_parameters"]["oml:description"] = description + xml["oml:data_edit_parameters"]["oml:creator"] = creator + xml["oml:data_edit_parameters"]["oml:contributor"] = contributor + xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date + xml["oml:data_edit_parameters"]["oml:language"] = language + xml["oml:data_edit_parameters"]["oml:citation"] = citation + xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url + xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url + + # delete None inputs + for k in list(xml["oml:data_edit_parameters"]): + if not xml["oml:data_edit_parameters"][k]: + del xml["oml:data_edit_parameters"][k] + + file_elements = {"edit_parameters": ("description.xml", xmltodict.unparse(xml))} + result_xml = openml._api_calls._perform_api_call( + "data/edit", "post", data=form_data, file_elements=file_elements + ) + result = xmltodict.parse(result_xml) + data_id = result["oml:data_edit"]["oml:id"] + return int(data_id) + + def 
_get_dataset_description(did_cache_dir, dataset_id): """Get the dataset description as xml dictionary. diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 958d28d94..c196ea36e 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -16,11 +16,17 @@ import openml from openml import OpenMLDataset -from openml.exceptions import OpenMLCacheException, OpenMLHashException, OpenMLPrivateDatasetError +from openml.exceptions import ( + OpenMLCacheException, + OpenMLHashException, + OpenMLPrivateDatasetError, + OpenMLServerException, +) from openml.testing import TestBase from openml.utils import _tag_entity, _create_cache_directory_for_id from openml.datasets.functions import ( create_dataset, + edit_dataset, attributes_arff_from_df, _get_cached_dataset, _get_cached_dataset_features, @@ -1331,3 +1337,76 @@ def test_get_dataset_cache_format_feather(self): self.assertEqual(X.shape, (150, 5)) self.assertEqual(len(categorical), X.shape[1]) self.assertEqual(len(attribute_names), X.shape[1]) + + def test_data_edit(self): + + # admin key for test server (only admins or owners can edit datasets). + # all users can edit their own datasets) + openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3" + + # case 1, editing description, creator, contributor, collection_date, original_data_url, + # paper_url, citation, language edits existing dataset. + did = 564 + result = edit_dataset( + did, + description="xor dataset represents XOR operation", + contributor="", + collection_date="2019-10-29 17:06:18", + original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor", + paper_url="", + citation="kaggle", + language="English", + ) + self.assertEqual(result, did) + + # case 2, editing data, attributes, default_target_attribute, row_id_attribute, + # ignore_attribute generates a new dataset + + column_names = [ + ("input1", "REAL"), + ("input2", "REAL"), + ("y", "REAL"), + ] + desc = "xor dataset represents XOR operation" + result = edit_dataset( + 564, + description=desc, + contributor="", + collection_date="2019-10-29 17:06:18", + attributes=column_names, + original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor", + paper_url="", + citation="kaggle", + language="English", + ) + self.assertNotEqual(did, result) + + def test_data_edit_errors(self): + + # admin key for test server (only admins or owners can edit datasets). 
+ openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3" + # Check server exception when no field to edit is provided + self.assertRaisesRegex( + OpenMLServerException, + "Please provide atleast one field among description, creator, contributor, " + "collection_date, language, citation, original_data_url or paper_url to edit.", + edit_dataset, + data_id=564, + ) + # Check server exception when unknown dataset is provided + self.assertRaisesRegex( + OpenMLServerException, + "Unknown dataset", + edit_dataset, + data_id=100000, + description="xor operation dataset", + ) + # Check server exception when a non-owner or non-admin tries to edit existing dataset + openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f" + self.assertRaisesRegex( + OpenMLServerException, + "Dataset is not owned by you", + edit_dataset, + data_id=564, + description="xor data", + ) From 666ca68790be90ae1153a6c355b7c1ad9921ef52 Mon Sep 17 00:00:00 2001 From: Neeratyoy Mallik Date: Mon, 3 Aug 2020 11:01:25 +0200 Subject: [PATCH 12/36] Adding support for scikit-learn > 0.22 (#936) * Preliminary changes * Updating unit tests for sklearn 0.22 and above * Triggering sklearn tests + fixes * Refactoring to inspect.signature in extensions --- .travis.yml | 6 +- openml/extensions/sklearn/extension.py | 18 +- .../test_sklearn_extension.py | 196 ++++++++++++------ tests/test_flows/test_flow.py | 77 +++++-- tests/test_runs/test_run_functions.py | 10 +- 5 files changed, 216 insertions(+), 91 deletions(-) diff --git a/.travis.yml b/.travis.yml index dcfda6d37..7360339ac 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,9 +15,13 @@ env: - TEST_DIR=/tmp/test_dir/ - MODULE=openml matrix: - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true" - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true" - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true" - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2" # Checks for older scikit-learn versions (which also don't nicely work with # Python3.7) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index af0b42144..fe9d029aa 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -994,12 +994,16 @@ def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> Tuple[Dict, Set] a set with all parameters that do not have a default value """ # parameters with defaults are optional, all others are required. 
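# A minimal, standard-library-only sketch of how inspect.signature separates
# required parameters from those with defaults, which is the behaviour the
# refactored helper below relies on (`example` is a hypothetical function used
# purely for illustration):
import inspect

def example(a, b, c=3, d="x"):
    pass

optional_params, required_params = dict(), set()
for name, param in inspect.signature(example).parameters.items():
    if param.default is inspect.Signature.empty:
        required_params.add(name)
    else:
        optional_params[name] = param.default

print(sorted(required_params))  # ['a', 'b']
print(optional_params)          # {'c': 3, 'd': 'x'}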
- signature = inspect.getfullargspec(fn_name) - if signature.defaults: - optional_params = dict(zip(reversed(signature.args), reversed(signature.defaults))) - else: - optional_params = dict() - required_params = {arg for arg in signature.args if arg not in optional_params} + parameters = inspect.signature(fn_name).parameters + required_params = set() + optional_params = dict() + for param in parameters.keys(): + parameter = parameters.get(param) + default_val = parameter.default # type: ignore + if default_val is inspect.Signature.empty: + required_params.add(param) + else: + optional_params[param] = default_val return optional_params, required_params def _deserialize_model( @@ -1346,7 +1350,7 @@ def _can_measure_cputime(self, model: Any) -> bool: # check the parameters for n_jobs n_jobs_vals = SklearnExtension._get_parameter_values_recursive(model.get_params(), "n_jobs") for val in n_jobs_vals: - if val is not None and val != 1: + if val is not None and val != 1 and val != "deprecated": return False return True diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 48832b58f..acc93b024 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -77,11 +77,14 @@ def test_serialize_model(self): criterion="entropy", max_features="auto", max_leaf_nodes=2000 ) - fixture_name = "sklearn.tree.tree.DecisionTreeClassifier" + tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" + fixture_name = "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name) fixture_short_name = "sklearn.DecisionTreeClassifier" # str obtained from self.extension._get_sklearn_description(model) fixture_description = "A decision tree classifier." version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__ + + presort_val = "false" if LooseVersion(sklearn.__version__) < "0.22" else '"deprecated"' # min_impurity_decrease has been introduced in 0.20 # min_impurity_split has been deprecated in 0.20 if LooseVersion(sklearn.__version__) < "0.19": @@ -114,12 +117,16 @@ def test_serialize_model(self): ("min_samples_leaf", "1"), ("min_samples_split", "2"), ("min_weight_fraction_leaf", "0.0"), - ("presort", "false"), + ("presort", presort_val), ("random_state", "null"), ("splitter", '"best"'), ) ) - structure_fixture = {"sklearn.tree.tree.DecisionTreeClassifier": []} + if LooseVersion(sklearn.__version__) >= "0.22": + fixture_parameters.update({"ccp_alpha": "0.0"}) + fixture_parameters.move_to_end("ccp_alpha", last=False) + + structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []} serialization = self.extension.model_to_flow(model) structure = serialization.get_structure("name") @@ -161,11 +168,18 @@ def test_serialize_model_clustering(self): with mock.patch.object(self.extension, "_check_dependencies") as check_dependencies_mock: model = sklearn.cluster.KMeans() - fixture_name = "sklearn.cluster.k_means_.KMeans" + cluster_name = "k_means_" if LooseVersion(sklearn.__version__) < "0.22" else "_kmeans" + fixture_name = "sklearn.cluster.{}.KMeans".format(cluster_name) fixture_short_name = "sklearn.KMeans" # str obtained from self.extension._get_sklearn_description(model) - fixture_description = "K-Means clustering" + fixture_description = "K-Means clustering{}".format( + "" if LooseVersion(sklearn.__version__) < "0.22" else "." 
+ ) version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__ + + n_jobs_val = "null" if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"' + precomp_val = '"auto"' if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"' + # n_jobs default has changed to None in 0.20 if LooseVersion(sklearn.__version__) < "0.20": fixture_parameters = OrderedDict( @@ -192,14 +206,14 @@ def test_serialize_model_clustering(self): ("max_iter", "300"), ("n_clusters", "8"), ("n_init", "10"), - ("n_jobs", "null"), - ("precompute_distances", '"auto"'), + ("n_jobs", n_jobs_val), + ("precompute_distances", precomp_val), ("random_state", "null"), ("tol", "0.0001"), ("verbose", "0"), ) ) - fixture_structure = {"sklearn.cluster.k_means_.KMeans": []} + fixture_structure = {"sklearn.cluster.{}.KMeans".format(cluster_name): []} serialization = self.extension.model_to_flow(model) structure = serialization.get_structure("name") @@ -230,11 +244,15 @@ def test_serialize_model_with_subcomponent(self): n_estimators=100, base_estimator=sklearn.tree.DecisionTreeClassifier() ) + weight_name = "{}weight_boosting".format( + "" if LooseVersion(sklearn.__version__) < "0.22" else "_" + ) + tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" fixture_name = ( - "sklearn.ensemble.weight_boosting.AdaBoostClassifier" - "(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)" + "sklearn.ensemble.{}.AdaBoostClassifier" + "(base_estimator=sklearn.tree.{}.DecisionTreeClassifier)".format(weight_name, tree_name) ) - fixture_class_name = "sklearn.ensemble.weight_boosting.AdaBoostClassifier" + fixture_class_name = "sklearn.ensemble.{}.AdaBoostClassifier".format(weight_name) fixture_short_name = "sklearn.AdaBoostClassifier" # str obtained from self.extension._get_sklearn_description(model) fixture_description = ( @@ -246,13 +264,13 @@ def test_serialize_model_with_subcomponent(self): " on difficult cases.\n\nThis class implements the algorithm known " "as AdaBoost-SAMME [2]." ) - fixture_subcomponent_name = "sklearn.tree.tree.DecisionTreeClassifier" - fixture_subcomponent_class_name = "sklearn.tree.tree.DecisionTreeClassifier" + fixture_subcomponent_name = "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name) + fixture_subcomponent_class_name = "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name) # str obtained from self.extension._get_sklearn_description(model.base_estimator) fixture_subcomponent_description = "A decision tree classifier." 
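# Side note on the '"deprecated"' fixtures above: on the scikit-learn releases this
# patch targets, deprecated constructor arguments such as DecisionTreeClassifier's
# `presort` (0.22/0.23) and KMeans' `n_jobs` (0.23) keep the literal string
# "deprecated" as their default, which is also why _can_measure_cputime now treats
# that sentinel like an unset n_jobs. A quick check (illustrative sketch; the
# printed value depends on the installed scikit-learn):
import sklearn
from sklearn.tree import DecisionTreeClassifier

print(sklearn.__version__)
print(DecisionTreeClassifier().get_params().get("presort"))
# scikit-learn < 0.22    -> False
# scikit-learn 0.22/0.23 -> 'deprecated'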
fixture_structure = { fixture_name: [], - "sklearn.tree.tree.DecisionTreeClassifier": ["base_estimator"], + "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): ["base_estimator"], } serialization = self.extension.model_to_flow(model) @@ -298,10 +316,11 @@ def test_serialize_pipeline(self): dummy = sklearn.dummy.DummyClassifier(strategy="prior") model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("dummy", dummy)]) + scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" fixture_name = ( "sklearn.pipeline.Pipeline(" - "scaler=sklearn.preprocessing.data.StandardScaler," - "dummy=sklearn.dummy.DummyClassifier)" + "scaler=sklearn.preprocessing.{}.StandardScaler," + "dummy=sklearn.dummy.DummyClassifier)".format(scaler_name) ) fixture_short_name = "sklearn.Pipeline(StandardScaler,DummyClassifier)" @@ -327,7 +346,7 @@ def test_serialize_pipeline(self): fixture_structure = { fixture_name: [], - "sklearn.preprocessing.data.StandardScaler": ["scaler"], + "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["scaler"], "sklearn.dummy.DummyClassifier": ["dummy"], } @@ -402,10 +421,12 @@ def test_serialize_pipeline_clustering(self): km = sklearn.cluster.KMeans() model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("clusterer", km)]) + scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" + cluster_name = "k_means_" if LooseVersion(sklearn.__version__) < "0.22" else "_kmeans" fixture_name = ( "sklearn.pipeline.Pipeline(" - "scaler=sklearn.preprocessing.data.StandardScaler," - "clusterer=sklearn.cluster.k_means_.KMeans)" + "scaler=sklearn.preprocessing.{}.StandardScaler," + "clusterer=sklearn.cluster.{}.KMeans)".format(scaler_name, cluster_name) ) fixture_short_name = "sklearn.Pipeline(StandardScaler,KMeans)" @@ -430,10 +451,9 @@ def test_serialize_pipeline_clustering(self): fixture_description = self.extension._get_sklearn_description(model) fixture_structure = { fixture_name: [], - "sklearn.preprocessing.data.StandardScaler": ["scaler"], - "sklearn.cluster.k_means_.KMeans": ["clusterer"], + "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["scaler"], + "sklearn.cluster.{}.KMeans".format(cluster_name): ["clusterer"], } - serialization = self.extension.model_to_flow(model) structure = serialization.get_structure("name") @@ -519,10 +539,12 @@ def test_serialize_column_transformer(self): ], remainder="passthrough", ) + + scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" fixture = ( "sklearn.compose._column_transformer.ColumnTransformer(" - "numeric=sklearn.preprocessing.data.StandardScaler," - "nominal=sklearn.preprocessing._encoders.OneHotEncoder)" + "numeric=sklearn.preprocessing.{}.StandardScaler," + "nominal=sklearn.preprocessing._encoders.OneHotEncoder)".format(scaler_name) ) fixture_short_name = "sklearn.ColumnTransformer" @@ -543,7 +565,7 @@ def test_serialize_column_transformer(self): fixture_structure = { fixture: [], - "sklearn.preprocessing.data.StandardScaler": ["numeric"], + "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["numeric"], "sklearn.preprocessing._encoders.OneHotEncoder": ["nominal"], } @@ -587,21 +609,26 @@ def test_serialize_column_transformer_pipeline(self): model = sklearn.pipeline.Pipeline( steps=[("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier())] ) + scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" + tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else 
"_classes" fixture_name = ( "sklearn.pipeline.Pipeline(" "transformer=sklearn.compose._column_transformer." "ColumnTransformer(" - "numeric=sklearn.preprocessing.data.StandardScaler," + "numeric=sklearn.preprocessing.{}.StandardScaler," "nominal=sklearn.preprocessing._encoders.OneHotEncoder)," - "classifier=sklearn.tree.tree.DecisionTreeClassifier)" + "classifier=sklearn.tree.{}.DecisionTreeClassifier)".format(scaler_name, tree_name) ) fixture_structure = { - "sklearn.preprocessing.data.StandardScaler": ["transformer", "numeric"], + "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): [ + "transformer", + "numeric", + ], "sklearn.preprocessing._encoders.OneHotEncoder": ["transformer", "nominal"], "sklearn.compose._column_transformer.ColumnTransformer(numeric=" - "sklearn.preprocessing.data.StandardScaler,nominal=sklearn." - "preprocessing._encoders.OneHotEncoder)": ["transformer"], - "sklearn.tree.tree.DecisionTreeClassifier": ["classifier"], + "sklearn.preprocessing.{}.StandardScaler,nominal=sklearn." + "preprocessing._encoders.OneHotEncoder)".format(scaler_name): ["transformer"], + "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): ["classifier"], fixture_name: [], } @@ -630,6 +657,7 @@ def test_serialize_column_transformer_pipeline(self): structure = serialization.get_structure("name") self.assertEqual(serialization.name, fixture_name) self.assertEqual(serialization.description, fixture_description) + self.assertDictEqual(structure, fixture_structure) # del serialization.model new_model = self.extension.flow_to_model(serialization) @@ -656,15 +684,18 @@ def test_serialize_feature_union(self): structure = serialization.get_structure("name") # OneHotEncoder was moved to _encoders module in 0.20 module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" + scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" fixture_name = ( "sklearn.pipeline.FeatureUnion(" "ohe=sklearn.preprocessing.{}.OneHotEncoder," - "scaler=sklearn.preprocessing.data.StandardScaler)".format(module_name_encoder) + "scaler=sklearn.preprocessing.{}.StandardScaler)".format( + module_name_encoder, scaler_name + ) ) fixture_structure = { fixture_name: [], "sklearn.preprocessing.{}." 
"OneHotEncoder".format(module_name_encoder): ["ohe"], - "sklearn.preprocessing.data.StandardScaler": ["scaler"], + "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["scaler"], } self.assertEqual(serialization.name, fixture_name) self.assertDictEqual(structure, fixture_structure) @@ -728,17 +759,20 @@ def test_serialize_feature_union_switched_names(self): fu2_serialization = self.extension.model_to_flow(fu2) # OneHotEncoder was moved to _encoders module in 0.20 module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" + scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" self.assertEqual( fu1_serialization.name, "sklearn.pipeline.FeatureUnion(" "ohe=sklearn.preprocessing.{}.OneHotEncoder," - "scaler=sklearn.preprocessing.data.StandardScaler)".format(module_name_encoder), + "scaler=sklearn.preprocessing.{}.StandardScaler)".format( + module_name_encoder, scaler_name + ), ) self.assertEqual( fu2_serialization.name, "sklearn.pipeline.FeatureUnion(" "scaler=sklearn.preprocessing.{}.OneHotEncoder," - "ohe=sklearn.preprocessing.data.StandardScaler)".format(module_name_encoder), + "ohe=sklearn.preprocessing.{}.StandardScaler)".format(module_name_encoder, scaler_name), ) def test_serialize_complex_flow(self): @@ -766,10 +800,15 @@ def test_serialize_complex_flow(self): # OneHotEncoder was moved to _encoders module in 0.20 module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" ohe_name = "sklearn.preprocessing.%s.OneHotEncoder" % module_name_encoder - scaler_name = "sklearn.preprocessing.data.StandardScaler" - tree_name = "sklearn.tree.tree.DecisionTreeClassifier" - boosting_name = ( - "sklearn.ensemble.weight_boosting.AdaBoostClassifier" "(base_estimator=%s)" % tree_name + scaler_name = "sklearn.preprocessing.{}.StandardScaler".format( + "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" + ) + tree_name = "sklearn.tree.{}.DecisionTreeClassifier".format( + "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" + ) + weight_name = "weight" if LooseVersion(sklearn.__version__) < "0.22" else "_weight" + boosting_name = "sklearn.ensemble.{}_boosting.AdaBoostClassifier(base_estimator={})".format( + weight_name, tree_name ) pipeline_name = "sklearn.pipeline.Pipeline(ohe=%s,scaler=%s," "boosting=%s)" % ( ohe_name, @@ -1195,12 +1234,24 @@ def test__get_fn_arguments_with_defaults(self): (sklearn.tree.DecisionTreeClassifier.__init__, 13), (sklearn.pipeline.Pipeline.__init__, 1), ] - else: + elif sklearn_version < "0.22": fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 16), (sklearn.tree.DecisionTreeClassifier.__init__, 13), (sklearn.pipeline.Pipeline.__init__, 2), ] + elif sklearn_version < "0.23": + fns = [ + (sklearn.ensemble.RandomForestRegressor.__init__, 18), + (sklearn.tree.DecisionTreeClassifier.__init__, 14), + (sklearn.pipeline.Pipeline.__init__, 2), + ] + else: + fns = [ + (sklearn.ensemble.RandomForestRegressor.__init__, 18), + (sklearn.tree.DecisionTreeClassifier.__init__, 14), + (sklearn.pipeline.Pipeline.__init__, 2), + ] for fn, num_params_with_defaults in fns: defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn) @@ -1225,11 +1276,18 @@ def test_deserialize_with_defaults(self): pipe_orig = sklearn.pipeline.Pipeline(steps=steps) pipe_adjusted = sklearn.clone(pipe_orig) - params = { - "Imputer__strategy": "median", - "OneHotEncoder__sparse": False, - "Estimator__min_samples_leaf": 42, - } + if 
LooseVersion(sklearn.__version__) < "0.23": + params = { + "Imputer__strategy": "median", + "OneHotEncoder__sparse": False, + "Estimator__min_samples_leaf": 42, + } + else: + params = { + "Imputer__strategy": "mean", + "OneHotEncoder__sparse": True, + "Estimator__min_samples_leaf": 1, + } pipe_adjusted.set_params(**params) flow = self.extension.model_to_flow(pipe_adjusted) pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True) @@ -1256,11 +1314,18 @@ def test_deserialize_adaboost_with_defaults(self): pipe_orig = sklearn.pipeline.Pipeline(steps=steps) pipe_adjusted = sklearn.clone(pipe_orig) - params = { - "Imputer__strategy": "median", - "OneHotEncoder__sparse": False, - "Estimator__n_estimators": 10, - } + if LooseVersion(sklearn.__version__) < "0.22": + params = { + "Imputer__strategy": "median", + "OneHotEncoder__sparse": False, + "Estimator__n_estimators": 10, + } + else: + params = { + "Imputer__strategy": "mean", + "OneHotEncoder__sparse": True, + "Estimator__n_estimators": 50, + } pipe_adjusted.set_params(**params) flow = self.extension.model_to_flow(pipe_adjusted) pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True) @@ -1293,14 +1358,24 @@ def test_deserialize_complex_with_defaults(self): pipe_orig = sklearn.pipeline.Pipeline(steps=steps) pipe_adjusted = sklearn.clone(pipe_orig) - params = { - "Imputer__strategy": "median", - "OneHotEncoder__sparse": False, - "Estimator__n_estimators": 10, - "Estimator__base_estimator__n_estimators": 10, - "Estimator__base_estimator__base_estimator__learning_rate": 0.1, - "Estimator__base_estimator__base_estimator__loss__n_neighbors": 13, - } + if LooseVersion(sklearn.__version__) < "0.23": + params = { + "Imputer__strategy": "median", + "OneHotEncoder__sparse": False, + "Estimator__n_estimators": 10, + "Estimator__base_estimator__n_estimators": 10, + "Estimator__base_estimator__base_estimator__learning_rate": 0.1, + "Estimator__base_estimator__base_estimator__loss__n_neighbors": 13, + } + else: + params = { + "Imputer__strategy": "mean", + "OneHotEncoder__sparse": True, + "Estimator__n_estimators": 50, + "Estimator__base_estimator__n_estimators": 10, + "Estimator__base_estimator__base_estimator__learning_rate": 0.1, + "Estimator__base_estimator__base_estimator__loss__n_neighbors": 5, + } pipe_adjusted.set_params(**params) flow = self.extension.model_to_flow(pipe_adjusted) pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True) @@ -1349,7 +1424,10 @@ def test_openml_param_name_to_sklearn(self): def test_obtain_parameter_values_flow_not_from_server(self): model = sklearn.linear_model.LogisticRegression(solver="lbfgs") flow = self.extension.model_to_flow(model) - msg = "Flow sklearn.linear_model.logistic.LogisticRegression has no " "flow_id!" + logistic_name = "logistic" if LooseVersion(sklearn.__version__) < "0.22" else "_logistic" + msg = "Flow sklearn.linear_model.{}.LogisticRegression has no flow_id!".format( + logistic_name + ) with self.assertRaisesRegex(ValueError, msg): self.extension.obtain_parameter_values(flow) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 9f289870e..8d08f4eaf 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -305,15 +305,27 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id) ) - fixture = ( - "The flow on the server is inconsistent with the local flow. 
" - "The server flow ID is 1. Please check manually and remove " - "the flow if necessary! Error is:\n" - "'Flow sklearn.ensemble.forest.RandomForestClassifier: " - "values for attribute 'name' differ: " - "'sklearn.ensemble.forest.RandomForestClassifier'" - "\nvs\n'sklearn.ensemble.forest.RandomForestClassifie'.'" - ) + if LooseVersion(sklearn.__version__) < "0.22": + fixture = ( + "The flow on the server is inconsistent with the local flow. " + "The server flow ID is 1. Please check manually and remove " + "the flow if necessary! Error is:\n" + "'Flow sklearn.ensemble.forest.RandomForestClassifier: " + "values for attribute 'name' differ: " + "'sklearn.ensemble.forest.RandomForestClassifier'" + "\nvs\n'sklearn.ensemble.forest.RandomForestClassifie'.'" + ) + else: + # sklearn.ensemble.forest -> sklearn.ensemble._forest + fixture = ( + "The flow on the server is inconsistent with the local flow. " + "The server flow ID is 1. Please check manually and remove " + "the flow if necessary! Error is:\n" + "'Flow sklearn.ensemble._forest.RandomForestClassifier: " + "values for attribute 'name' differ: " + "'sklearn.ensemble._forest.RandomForestClassifier'" + "\nvs\n'sklearn.ensemble._forest.RandomForestClassifie'.'" + ) self.assertEqual(context_manager.exception.args[0], fixture) self.assertEqual(get_flow_mock.call_count, 2) @@ -463,19 +475,40 @@ def test_sklearn_to_upload_to_flow(self): # OneHotEncoder was moved to _encoders module in 0.20 module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" - fixture_name = ( - "%ssklearn.model_selection._search.RandomizedSearchCV(" - "estimator=sklearn.pipeline.Pipeline(" - "ohe=sklearn.preprocessing.%s.OneHotEncoder," - "scaler=sklearn.preprocessing.data.StandardScaler," - "fu=sklearn.pipeline.FeatureUnion(" - "pca=sklearn.decomposition.truncated_svd.TruncatedSVD," - "fs=" - "sklearn.feature_selection.univariate_selection.SelectPercentile)," - "boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier(" - "base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))" - % (sentinel, module_name_encoder) - ) + if LooseVersion(sklearn.__version__) < "0.22": + fixture_name = ( + "%ssklearn.model_selection._search.RandomizedSearchCV(" + "estimator=sklearn.pipeline.Pipeline(" + "ohe=sklearn.preprocessing.%s.OneHotEncoder," + "scaler=sklearn.preprocessing.data.StandardScaler," + "fu=sklearn.pipeline.FeatureUnion(" + "pca=sklearn.decomposition.truncated_svd.TruncatedSVD," + "fs=" + "sklearn.feature_selection.univariate_selection.SelectPercentile)," + "boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier(" + "base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))" + % (sentinel, module_name_encoder) + ) + else: + # sklearn.sklearn.preprocessing.data -> sklearn.sklearn.preprocessing._data + # sklearn.sklearn.decomposition.truncated_svd -> sklearn.decomposition._truncated_svd + # sklearn.feature_selection.univariate_selection -> + # sklearn.feature_selection._univariate_selection + # sklearn.ensemble.weight_boosting -> sklearn.ensemble._weight_boosting + # sklearn.tree.tree.DecisionTree... -> sklearn.tree._classes.DecisionTree... 
+ fixture_name = ( + "%ssklearn.model_selection._search.RandomizedSearchCV(" + "estimator=sklearn.pipeline.Pipeline(" + "ohe=sklearn.preprocessing.%s.OneHotEncoder," + "scaler=sklearn.preprocessing._data.StandardScaler," + "fu=sklearn.pipeline.FeatureUnion(" + "pca=sklearn.decomposition._truncated_svd.TruncatedSVD," + "fs=" + "sklearn.feature_selection._univariate_selection.SelectPercentile)," + "boosting=sklearn.ensemble._weight_boosting.AdaBoostClassifier(" + "base_estimator=sklearn.tree._classes.DecisionTreeClassifier)))" + % (sentinel, module_name_encoder) + ) self.assertEqual(new_flow.name, fixture_name) new_flow.model.fit(X, y) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 74f011b7c..aca9580c9 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -199,8 +199,11 @@ def _perform_run( classes_without_random_state = [ "sklearn.model_selection._search.GridSearchCV", "sklearn.pipeline.Pipeline", - "sklearn.linear_model.base.LinearRegression", ] + if LooseVersion(sklearn.__version__) < "0.22": + classes_without_random_state.append("sklearn.linear_model.base.LinearRegression") + else: + classes_without_random_state.append("sklearn.linear_model._base.LinearRegression") def _remove_random_state(flow): if "random_state" in flow.parameters: @@ -779,10 +782,13 @@ def _test_local_evaluations(self, run): (sklearn.metrics.cohen_kappa_score, {"weights": None}), (sklearn.metrics.roc_auc_score, {}), (sklearn.metrics.average_precision_score, {}), - (sklearn.metrics.jaccard_similarity_score, {}), (sklearn.metrics.precision_score, {"average": "macro"}), (sklearn.metrics.brier_score_loss, {}), ] + if LooseVersion(sklearn.__version__) < "0.23": + tests.append((sklearn.metrics.jaccard_similarity_score, {})) + else: + tests.append((sklearn.metrics.jaccard_score, {})) for test_idx, test in enumerate(tests): alt_scores = run.get_metric_fn(sklearn_fn=test[0], kwargs=test[1],) self.assertEqual(len(alt_scores), 10) From 5d9c69c210792d8b447c8b17d466ac44e41d0eb2 Mon Sep 17 00:00:00 2001 From: zikun <33176974+zikun@users.noreply.github.com> Date: Mon, 3 Aug 2020 22:48:44 +0800 Subject: [PATCH 13/36] Add flake8-print in pre-commit (#939) * Add flake8-print in pre-commit config * Replace print statements with logging --- .flake8 | 2 +- .pre-commit-config.yaml | 4 ++++ openml/extensions/sklearn/extension.py | 2 +- tests/conftest.py | 1 - tests/test_datasets/test_dataset_functions.py | 4 +++- tests/test_study/test_study_examples.py | 6 ++++-- 6 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.flake8 b/.flake8 index c0fe5e06f..08bb8ea10 100644 --- a/.flake8 +++ b/.flake8 @@ -1,7 +1,7 @@ [flake8] max-line-length = 100 show-source = True -select = C,E,F,W,B +select = C,E,F,W,B,T ignore = E203, E402, W503 per-file-ignores = *__init__.py:F401 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 75e53f0dd..b3a1d2aba 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,6 +19,10 @@ repos: - id: flake8 name: flake8 openml files: openml/* + additional_dependencies: + - flake8-print==3.1.4 - id: flake8 name: flake8 tests files: tests/* + additional_dependencies: + - flake8-print==3.1.4 diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index fe9d029aa..4a3015bdc 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -1316,7 +1316,7 @@ def _prevent_optimize_n_jobs(self, model): "Could not find attribute " 
"param_distributions." ) - print( + logger.warning( "Warning! Using subclass BaseSearchCV other than " "{GridSearchCV, RandomizedSearchCV}. " "Should implement param check. " diff --git a/tests/conftest.py b/tests/conftest.py index 59fa33aca..461a513fd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -40,7 +40,6 @@ # exploiting the fact that conftest.py always resides in the root directory for tests static_dir = os.path.dirname(os.path.abspath(__file__)) logger.info("static directory: {}".format(static_dir)) -print("static directory: {}".format(static_dir)) while True: if "openml" in os.listdir(static_dir): break diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index c196ea36e..a3be7b2b7 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1160,7 +1160,9 @@ def test_publish_fetch_ignore_attribute(self): except Exception as e: # returned code 273: Dataset not processed yet # returned code 362: No qualities found - print("Failed to fetch dataset:{} with '{}'.".format(dataset.id, str(e))) + TestBase.logger.error( + "Failed to fetch dataset:{} with '{}'.".format(dataset.id, str(e)) + ) time.sleep(10) continue if downloaded_dataset is None: diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index 2c403aa84..14e2405f2 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -48,10 +48,12 @@ def test_Figure1a(self): clf, task, avoid_duplicate_runs=False ) # run classifier on splits (requires API key) score = run.get_metric_fn(sklearn.metrics.accuracy_score) # print accuracy score - print("Data set: %s; Accuracy: %0.2f" % (task.get_dataset().name, score.mean())) + TestBase.logger.info( + "Data set: %s; Accuracy: %0.2f" % (task.get_dataset().name, score.mean()) + ) run.publish() # publish the experiment on OpenML (optional) TestBase._mark_entity_for_removal("run", run.run_id) TestBase.logger.info( "collected from {}: {}".format(__file__.split("/")[-1], run.run_id) ) - print("URL for run: %s/run/%d" % (openml.config.server, run.run_id)) + TestBase.logger.info("URL for run: %s/run/%d" % (openml.config.server, run.run_id)) From 7d51a766f0d5540d416de3f149645a3b6ad4b282 Mon Sep 17 00:00:00 2001 From: Sahithya Ravi <44670788+sahithyaravi1493@users.noreply.github.com> Date: Fri, 7 Aug 2020 10:05:40 +0200 Subject: [PATCH 14/36] Fix edit api (#940) * fix edit api --- openml/datasets/functions.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 4446f0e90..bda02d419 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -891,10 +891,18 @@ def edit_dataset( ] ): logger.warning("Creating a new version of dataset, cannot edit existing version") + + # Get old dataset and features dataset = get_dataset(data_id) + df, y, categorical, attribute_names = dataset.get_data(dataset_format="dataframe") + attributes_old = attributes_arff_from_df(df) - decoded_arff = dataset._get_arff(format="arff") - data_old = decoded_arff["data"] + # Sparse data needs to be provided in a different format from dense data + if dataset.format == "sparse_arff": + df, y, categorical, attribute_names = dataset.get_data(dataset_format="array") + data_old = coo_matrix(df) + else: + data_old = df data_new = data if data is not None else data_old dataset_new = create_dataset( name=dataset.name, @@ -904,7 +912,7 
@@ def edit_dataset( collection_date=collection_date or dataset.collection_date, language=language or dataset.language, licence=dataset.licence, - attributes=attributes or decoded_arff["attributes"], + attributes=attributes or attributes_old, data=data_new, default_target_attribute=default_target_attribute or dataset.default_target_attribute, ignore_attribute=ignore_attribute or dataset.ignore_attribute, From 75a5440094c643cca7b97d91bfb8b1046df0b82f Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 12 Aug 2020 18:56:50 +0200 Subject: [PATCH 15/36] Update subflow paragraph --- examples/30_extended/custom_flow_tutorial.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index 70c4adfb8..e0ee6a21c 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -71,10 +71,14 @@ ) #################################################################################################### -# It is possible for flows to contain subflows. In this example, the auto-sklearn flow is a -# subflow, this means that the subflow is entirely executed as part of this flow. -# Using this modularity also allows your runs to specify which hyperparameters of the -# subflows were used! +# It is possible to build a flow which uses other flows. +# For example, the Random Forest Classifier is a flow, but you could also construct a flow +# which uses a Random Forest Classifier in a ML pipeline. When constructing the pipeline flow, +# you can use the Random Forest Classifier flow as a *subflow*. It allows for +# all hyperparameters of the Random Classifier Flow to also be specified in your pipeline flow. +# +# In this example, the auto-sklearn flow is a subflow: the auto-sklearn flow is entirely executed as part of this flow. +# This allows people to specify auto-sklearn hyperparameters used in this flow. # Using a subflow is not required. 
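# For comparison, a sketch of how subflows appear when a flow is generated
# automatically from a scikit-learn pipeline (assumes the scikit-learn extension is
# available; the pipeline itself is only an illustration and not part of the
# benchmark setup described in this tutorial):
import sklearn.ensemble
import sklearn.impute
import sklearn.pipeline
from openml.extensions.sklearn import SklearnExtension

pipe = sklearn.pipeline.Pipeline(
    steps=[
        ("imputer", sklearn.impute.SimpleImputer()),
        ("classifier", sklearn.ensemble.RandomForestClassifier()),
    ]
)
pipeline_flow = SklearnExtension().model_to_flow(pipe)
print(list(pipeline_flow.components))  # the subflows, keyed by step name
print(pipeline_flow.components["classifier"].parameters["n_estimators"])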
# # Note: flow 15275 is not actually the right flow on the test server, From 23a08ab16a159048d84210f6864e322a5e3749c7 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 14 Aug 2020 14:38:21 +0200 Subject: [PATCH 16/36] Check the ClassificationTask has class label set --- openml/runs/functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 77b803c6c..ef8880b23 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -1064,6 +1064,8 @@ def format_prediction( if isinstance(task, OpenMLClassificationTask): if proba is None: raise ValueError("Predicted Class Probabilities are required for classification task") + if task.class_labels is None: + raise ValueError("The classification task must have class labels set") if not set(task.class_labels) == set(proba): raise ValueError("Each class should have a predicted probability") if sample is None: From 95d1fcb8abfec03393e92eda1ad09b1ca29e3381 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 17 Aug 2020 10:38:31 +0200 Subject: [PATCH 17/36] Test task is of supported type --- openml/runs/functions.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index ef8880b23..0dfc68d60 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -830,7 +830,7 @@ def list_runs( study: Optional[int] = None, display_errors: bool = False, output_format: str = "dict", - **kwargs + **kwargs, ) -> Union[Dict, pd.DataFrame]: """ List all runs matching all of the given filters. @@ -902,7 +902,7 @@ def list_runs( tag=tag, study=study, display_errors=display_errors, - **kwargs + **kwargs, ) @@ -915,7 +915,7 @@ def _list_runs( study: Optional[int] = None, display_errors: bool = False, output_format: str = "dict", - **kwargs + **kwargs, ) -> Union[Dict, pd.DataFrame]: """ Perform API call `/run/list/{filters}' @@ -1075,4 +1075,7 @@ def format_prediction( sample = 0 probabilities = [proba[c] for c in task.class_labels] return [repeat, fold, sample, index, *probabilities, truth, prediction] - return [repeat, fold, index, truth, prediction] + elif isinstance(task, OpenMLRegressionTask): + return [repeat, fold, index, truth, prediction] + else: + raise TypeError(f"Formatting for {type(task)} is not supported.") From 41aa789363100d0d911da07312dc81de64bfb5ce Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 17 Aug 2020 10:42:28 +0200 Subject: [PATCH 18/36] Add tests for format_prediction --- tests/test_runs/test_run_functions.py | 46 ++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 74f011b7c..2c46e0fcd 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -22,10 +22,7 @@ import openml.extensions.sklearn from openml.testing import TestBase, SimpleImputer -from openml.runs.functions import ( - _run_task_get_arffcontent, - run_exists, -) +from openml.runs.functions import _run_task_get_arffcontent, run_exists, format_prediction from openml.runs.trace import OpenMLRunTrace from openml.tasks import TaskTypeEnum @@ -1336,3 +1333,44 @@ def test_run_flow_on_task_downloaded_flow(self): run.publish() TestBase._mark_entity_for_removal("run", run.run_id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], run.run_id)) + + def test_format_prediction_non_supervised(self): + # non-supervised tasks don't exist on the test server + 
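# A usage sketch of the helper exercised by these tests: building one prediction
# row for a classification task. `task_id` is a placeholder, and the "Iris-setosa"
# labels and probabilities are invented purely for illustration.
import openml
from openml.runs.functions import format_prediction

task = openml.tasks.get_task(task_id)  # an OpenMLClassificationTask
row = format_prediction(
    task=task,
    repeat=0,
    fold=0,
    index=0,
    prediction="Iris-setosa",
    truth="Iris-setosa",
    proba={c: (1.0 if c == "Iris-setosa" else 0.0) for c in task.class_labels},
)
# row == [repeat, fold, sample, index, *class probabilities, truth, prediction]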
openml.config.server = self.production_server + clustering = openml.tasks.get_task(126033, download_data=False) + ignored_input = [0] * 5 + with self.assertRaises(TypeError): + format_prediction(clustering, *ignored_input) + + def test_format_prediction_classification_no_probabilities(self): + classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) + ignored_input = [0] * 5 + with self.assertRaises(ValueError): + format_prediction(classification, *ignored_input, proba=None) + + def test_format_prediction_classification_incomplete_probabilities(self): + classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) + ignored_input = [0] * 5 + incomplete_probabilities = {c: 0.2 for c in classification.class_labels[1:]} + with self.assertRaises(ValueError): + format_prediction(classification, *ignored_input, proba=incomplete_probabilities) + + def test_format_prediction_task_without_classlabels_set(self): + classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) + classification.class_labels = None + ignored_input = [0] * 5 + with self.assertRaises(ValueError): + format_prediction(classification, *ignored_input, proba={}) + + def test_format_prediction_task_learning_curve_sample_not_set(self): + learning_curve = openml.tasks.get_task(801, download_data=False) + probabilities = {c: 0.2 for c in learning_curve.class_labels} + ignored_input = [0] * 5 + with self.assertRaises(ValueError): + format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) + + def test_format_prediction_task_regression(self): + regression = openml.tasks.get_task(self.TEST_SERVER_TASK_REGRESSION[0], download_data=False) + ignored_input = [0] * 5 + res = format_prediction(regression, *ignored_input) + self.assertListEqual(res, [0] * 5) From 5d2e0ce980bfee2de5197e27c1e03c7518665a3b Mon Sep 17 00:00:00 2001 From: Neeratyoy Mallik Date: Mon, 17 Aug 2020 10:42:59 +0200 Subject: [PATCH 19/36] Adding Python 3.8 support (#916) * Adding Python 3.8 support * Fixing indentation * Execute test cases for 3.8 * Testing * Making install script fail --- .travis.yml | 26 ++++++++++++++------------ ci_scripts/install.sh | 2 ++ setup.py | 1 + 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7360339ac..80f3bda42 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,18 +15,20 @@ env: - TEST_DIR=/tmp/test_dir/ - MODULE=openml matrix: - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2" - # Checks for older scikit-learn versions (which also don't nicely work with - # Python3.7) - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0 + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true" + - DISTRIB="conda" 
PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true" + - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.23.1" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.22.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2" + # Checks for older scikit-learn versions (which also don't nicely work with + # Python3.7) + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0 # Travis issue # https://github.com/travis-ci/travis-ci/issues/8920 diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index 67cd1bb38..29181c5c4 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -1,5 +1,7 @@ # License: BSD 3-Clause +set -e + # Deactivate the travis-provided virtual environment and setup a # conda-based environment instead deactivate diff --git a/setup.py b/setup.py index f1f7a5871..476becc10 100644 --- a/setup.py +++ b/setup.py @@ -96,5 +96,6 @@ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], ) From 5ef24ab2d5c88aba728496cbd7c324b4aa2e33c3 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 25 Aug 2020 14:21:59 +0200 Subject: [PATCH 20/36] Process feedback Neeratyoy --- examples/30_extended/custom_flow_tutorial.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index e0ee6a21c..737c67c75 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -4,7 +4,8 @@ ================================ The most convenient way to create a flow for your machine learning workflow is to generate it -automatically as described in <>. However, there are scenarios where this is not possible, such +automatically as described in the `Obtain Flow IDs `_ tutorial. # noqa E501 +However, there are scenarios where this is not possible, such as when the flow uses a framework without an extension or when the flow is described by a script. In those cases you can still create a custom flow by following the steps of this tutorial. @@ -58,7 +59,7 @@ #################################################################################################### # Next we define the flow hyperparameters. We define their name and default value in `parameters`, # and provide meta-data for each hyperparameter through `parameters_meta_info`. -# Note that eventhough the argument name is `parameters` they describe the hyperparameters. +# Note that even though the argument name is `parameters` they describe the hyperparameters. # The use of ordered dicts is required. flow_hyperparameters = dict( @@ -79,7 +80,7 @@ # # In this example, the auto-sklearn flow is a subflow: the auto-sklearn flow is entirely executed as part of this flow. # This allows people to specify auto-sklearn hyperparameters used in this flow. -# Using a subflow is not required. 
+# In general, using a subflow is not required. # # Note: flow 15275 is not actually the right flow on the test server, # but that does not matter for this demonstration. From 1ce5a12a85110c0295c42d39f7ebe97576bfef12 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 28 Aug 2020 17:05:24 +0200 Subject: [PATCH 21/36] Test Exception with Regex Also throw NotImplementedError instead of TypeError for unsupported task types. Added links in the example. --- examples/30_extended/custom_flow_tutorial.py | 6 ++++-- openml/runs/functions.py | 4 ++-- tests/test_runs/test_run_functions.py | 14 +++++++++----- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index 737c67c75..3b918e108 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -9,7 +9,7 @@ as when the flow uses a framework without an extension or when the flow is described by a script. In those cases you can still create a custom flow by following the steps of this tutorial. -As an example we will use the flows generated for the AutoML Benchmark (...), +As an example we will use the flows generated for the `AutoML Benchmark `_, and also show how to link runs to the custom flow. """ @@ -32,7 +32,7 @@ # 1. Defining the flow # ==================== # The first step is to define all the hyperparameters of your flow. -# Check ... for the descriptions of each variable. +# The API pages feature a descriptions of each variable of the `OpenMLFlow `_. # noqa E501 # Note that `external version` and `name` together uniquely identify a flow. # # The AutoML Benchmark runs AutoML systems across a range of tasks. @@ -201,3 +201,5 @@ ) my_run.publish() print("run created:", my_run.run_id) + +openml.config.stop_using_configuration_for_example() diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 0dfc68d60..a3888d3a1 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -1063,7 +1063,7 @@ def format_prediction( """ if isinstance(task, OpenMLClassificationTask): if proba is None: - raise ValueError("Predicted Class Probabilities are required for classification task") + raise ValueError("`proba` is required for classification task") if task.class_labels is None: raise ValueError("The classification task must have class labels set") if not set(task.class_labels) == set(proba): @@ -1078,4 +1078,4 @@ def format_prediction( elif isinstance(task, OpenMLRegressionTask): return [repeat, fold, index, truth, prediction] else: - raise TypeError(f"Formatting for {type(task)} is not supported.") + raise NotImplementedError(f"Formatting for {type(task)} is not supported.") diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 2c46e0fcd..6e92552c6 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1339,34 +1339,38 @@ def test_format_prediction_non_supervised(self): openml.config.server = self.production_server clustering = openml.tasks.get_task(126033, download_data=False) ignored_input = [0] * 5 - with self.assertRaises(TypeError): + with self.assertRaisesRegex( + NotImplementedError, r"Formatting for is not supported." 
+ ): format_prediction(clustering, *ignored_input) def test_format_prediction_classification_no_probabilities(self): classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) ignored_input = [0] * 5 - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) def test_format_prediction_classification_incomplete_probabilities(self): classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) ignored_input = [0] * 5 incomplete_probabilities = {c: 0.2 for c in classification.class_labels[1:]} - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Each class should have a predicted probability"): format_prediction(classification, *ignored_input, proba=incomplete_probabilities) def test_format_prediction_task_without_classlabels_set(self): classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) classification.class_labels = None ignored_input = [0] * 5 - with self.assertRaises(ValueError): + with self.assertRaisesRegex( + ValueError, "The classification task must have class labels set" + ): format_prediction(classification, *ignored_input, proba={}) def test_format_prediction_task_learning_curve_sample_not_set(self): learning_curve = openml.tasks.get_task(801, download_data=False) probabilities = {c: 0.2 for c in learning_curve.class_labels} ignored_input = [0] * 5 - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "`sample` can not be none for LearningCurveTask"): format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) def test_format_prediction_task_regression(self): From f70c720c1624e3fadc52909885a4d3a096cd7214 Mon Sep 17 00:00:00 2001 From: Sahithya Ravi <44670788+sahithyaravi1493@users.noreply.github.com> Date: Mon, 31 Aug 2020 20:27:31 +0200 Subject: [PATCH 22/36] change edit_api to reflect server (#941) * change edit_api to reflect server * change test and example to reflect rest API changes * tutorial comments * Update datasets_tutorial.py --- examples/30_extended/datasets_tutorial.py | 38 ++++---- openml/datasets/functions.py | 64 +------------- tests/test_datasets/test_dataset_functions.py | 87 +++++++++---------- 3 files changed, 64 insertions(+), 125 deletions(-) diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index 40b35bbea..e129b7718 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -21,7 +21,7 @@ # # * Use the output_format parameter to select output type # * Default gives 'dict' (other option: 'dataframe', see below) - +# openml_list = openml.datasets.list_datasets() # returns a dict # Show a nice table with some key data properties @@ -117,15 +117,21 @@ # This example uses the test server, to avoid editing a dataset on the main server. 
openml.config.start_using_configuration_for_example() ############################################################################ -# Changes to these field edits existing version: allowed only for dataset owner +# Edit non-critical fields, allowed for all authorized users: +# description, creator, contributor, collection_date, language, citation, +# original_data_url, paper_url +desc = ( + "This data sets consists of 3 different types of irises' " + "(Setosa, Versicolour, and Virginica) petal and sepal length," + " stored in a 150x4 numpy.ndarray" +) +did = 128 data_id = edit_dataset( - 564, - description="xor dataset represents XOR operation", - contributor="", - collection_date="2019-10-29 17:06:18", - original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor", - paper_url="", - citation="kaggle", + did, + description=desc, + creator="R.A.Fisher", + collection_date="1937", + citation="The use of multiple measurements in taxonomic problems", language="English", ) edited_dataset = get_dataset(data_id) @@ -133,15 +139,11 @@ ############################################################################ -# Changes to these fields: attributes, default_target_attribute, -# row_id_attribute, ignore_attribute generates a new edited version: allowed for anyone - -new_attributes = [ - ("x0", "REAL"), - ("x1", "REAL"), - ("y", "REAL"), -] -data_id = edit_dataset(564, attributes=new_attributes) +# Edit critical fields, allowed only for owners of the dataset: +# default_target_attribute, row_id_attribute, ignore_attribute +# To edit critical fields of a dataset owned by you, configure the API key: +# openml.config.apikey = 'FILL_IN_OPENML_API_KEY' +data_id = edit_dataset(564, default_target_attribute="y") print(f"Edited dataset ID: {data_id}") openml.config.stop_using_configuration_for_example() diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index bda02d419..0f3037a74 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -806,8 +806,6 @@ def edit_dataset( contributor=None, collection_date=None, language=None, - attributes=None, - data=None, default_target_attribute=None, ignore_attribute=None, citation=None, @@ -839,17 +837,6 @@ def edit_dataset( language : str Language in which the data is represented. Starts with 1 upper case letter, rest lower case, e.g. 'English'. - attributes : list, dict, or 'auto' - A list of tuples. Each tuple consists of the attribute name and type. - If passing a pandas DataFrame, the attributes can be automatically - inferred by passing ``'auto'``. Specific attributes can be manually - specified by a passing a dictionary where the key is the name of the - attribute and the value is the data type of the attribute. - data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features) - An array that contains both the attributes and the targets. When - providing a dataframe, the attribute names and type can be inferred by - passing ``attributes='auto'``. - The target feature is indicated as meta-data of the dataset. default_target_attribute : str The default target attribute, if it exists. Can have multiple values, comma separated. 
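# For reference, a sketch of the body edit_dataset posts to `data/edit` after this
# change; only non-None fields are kept, and the values shown are invented for
# illustration.
import xmltodict
from collections import OrderedDict

xml = OrderedDict()
xml["oml:data_edit_parameters"] = OrderedDict(
    [
        ("@xmlns:oml", "http://openml.org/openml"),
        ("oml:description", "Iris with a corrected description"),
        ("oml:default_target_attribute", "class"),
    ]
)
print(xmltodict.unparse(xml, pretty=True))
# prints an <oml:data_edit_parameters> document with one child element per edited field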
@@ -879,54 +866,6 @@ def edit_dataset( if not isinstance(data_id, int): raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) - # case 1, changing these fields creates a new version of the dataset with changed field - if any( - field is not None - for field in [ - data, - attributes, - default_target_attribute, - row_id_attribute, - ignore_attribute, - ] - ): - logger.warning("Creating a new version of dataset, cannot edit existing version") - - # Get old dataset and features - dataset = get_dataset(data_id) - df, y, categorical, attribute_names = dataset.get_data(dataset_format="dataframe") - attributes_old = attributes_arff_from_df(df) - - # Sparse data needs to be provided in a different format from dense data - if dataset.format == "sparse_arff": - df, y, categorical, attribute_names = dataset.get_data(dataset_format="array") - data_old = coo_matrix(df) - else: - data_old = df - data_new = data if data is not None else data_old - dataset_new = create_dataset( - name=dataset.name, - description=description or dataset.description, - creator=creator or dataset.creator, - contributor=contributor or dataset.contributor, - collection_date=collection_date or dataset.collection_date, - language=language or dataset.language, - licence=dataset.licence, - attributes=attributes or attributes_old, - data=data_new, - default_target_attribute=default_target_attribute or dataset.default_target_attribute, - ignore_attribute=ignore_attribute or dataset.ignore_attribute, - citation=citation or dataset.citation, - row_id_attribute=row_id_attribute or dataset.row_id_attribute, - original_data_url=original_data_url or dataset.original_data_url, - paper_url=paper_url or dataset.paper_url, - update_comment=dataset.update_comment, - version_label=dataset.version_label, - ) - dataset_new.publish() - return dataset_new.dataset_id - - # case 2, changing any of these fields will update existing dataset # compose data edit parameters as xml form_data = {"data_id": data_id} xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' @@ -937,6 +876,9 @@ def edit_dataset( xml["oml:data_edit_parameters"]["oml:contributor"] = contributor xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date xml["oml:data_edit_parameters"]["oml:language"] = language + xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute + xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute + xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute xml["oml:data_edit_parameters"]["oml:citation"] = citation xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index a3be7b2b7..5076d06c2 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1341,57 +1341,43 @@ def test_get_dataset_cache_format_feather(self): self.assertEqual(len(attribute_names), X.shape[1]) def test_data_edit(self): - - # admin key for test server (only admins or owners can edit datasets). - # all users can edit their own datasets) - openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3" - - # case 1, editing description, creator, contributor, collection_date, original_data_url, - # paper_url, citation, language edits existing dataset. 
- did = 564 - result = edit_dataset( - did, - description="xor dataset represents XOR operation", - contributor="", - collection_date="2019-10-29 17:06:18", - original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor", - paper_url="", - citation="kaggle", - language="English", + # Case 1 + # All users can edit non-critical fields of datasets + desc = ( + "This data sets consists of 3 different types of irises' " + "(Setosa, Versicolour, and Virginica) petal and sepal length," + " stored in a 150x4 numpy.ndarray" ) - self.assertEqual(result, did) - - # case 2, editing data, attributes, default_target_attribute, row_id_attribute, - # ignore_attribute generates a new dataset - - column_names = [ - ("input1", "REAL"), - ("input2", "REAL"), - ("y", "REAL"), - ] - desc = "xor dataset represents XOR operation" + did = 128 result = edit_dataset( - 564, + did, description=desc, - contributor="", - collection_date="2019-10-29 17:06:18", - attributes=column_names, - original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor", - paper_url="", - citation="kaggle", + creator="R.A.Fisher", + collection_date="1937", + citation="The use of multiple measurements in taxonomic problems", language="English", ) - self.assertNotEqual(did, result) + self.assertEqual(did, result) + edited_dataset = openml.datasets.get_dataset(did) + self.assertEqual(edited_dataset.description, desc) + + # Case 2 + # only owners (or admin) can edit all critical fields of datasets + # this is a dataset created by CI, so it is editable by this test + did = 315 + result = edit_dataset(did, default_target_attribute="col_1", ignore_attribute="col_2") + self.assertEqual(did, result) + edited_dataset = openml.datasets.get_dataset(did) + self.assertEqual(edited_dataset.ignore_attribute, ["col_2"]) def test_data_edit_errors(self): - - # admin key for test server (only admins or owners can edit datasets). - openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3" # Check server exception when no field to edit is provided self.assertRaisesRegex( OpenMLServerException, - "Please provide atleast one field among description, creator, contributor, " - "collection_date, language, citation, original_data_url or paper_url to edit.", + "Please provide atleast one field among description, creator, " + "contributor, collection_date, language, citation, " + "original_data_url, default_target_attribute, row_id_attribute, " + "ignore_attribute or paper_url to edit.", edit_dataset, data_id=564, ) @@ -1403,12 +1389,21 @@ def test_data_edit_errors(self): data_id=100000, description="xor operation dataset", ) - # Check server exception when a non-owner or non-admin tries to edit existing dataset - openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f" + # Check server exception when owner/admin edits critical features of dataset with tasks self.assertRaisesRegex( OpenMLServerException, - "Dataset is not owned by you", + "Critical features default_target_attribute, row_id_attribute and ignore_attribute " + "can only be edited for datasets without any tasks.", edit_dataset, - data_id=564, - description="xor data", + data_id=223, + default_target_attribute="y", + ) + # Check server exception when a non-owner or non-admin tries to edit critical features + self.assertRaisesRegex( + OpenMLServerException, + "Critical features default_target_attribute, row_id_attribute and ignore_attribute " + "can be edited only by the owner. 
Fork the dataset if changes are required.", + edit_dataset, + data_id=128, + default_target_attribute="y", ) From f8839de87e4210b9653c87b11e0a8df059dd1895 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 7 Jul 2020 22:13:08 +0200 Subject: [PATCH 23/36] Create first section: Creating Custom Flow --- examples/30_extended/custom_flow_tutorial.py | 96 ++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 examples/30_extended/custom_flow_tutorial.py diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py new file mode 100644 index 000000000..c72cd949b --- /dev/null +++ b/examples/30_extended/custom_flow_tutorial.py @@ -0,0 +1,96 @@ +""" +================================ +Creating and Using a Custom Flow +================================ + +The most convenient way to create a flow for your machine learning workflow is to generate it +automatically as described in <>. However, there are scenarios where this is not possible, such +as when the flow uses a framework without an extension or when the flow is described by a script. + +In those cases you can still create a custom flow by following the steps of this tutorial. +As an example we will use the flows generated for the AutoML Benchmark (...), +and also show how to link runs to the custom flow. +""" + +#################################################################################################### + +# License: BSD 3-Clause +# .. warning:: This example uploads data. For that reason, this example +# connects to the test server at test.openml.org. This prevents the main +# server from crowding with example datasets, tasks, runs, and so on. +from collections import OrderedDict + +import openml + +openml.config.start_using_configuration_for_example() + +#################################################################################################### +# 1. Defining the flow +# ==================== +# The first step is to define all the hyperparameters of your flow. +# Check ... for the descriptions of each variable. +# Note that `external version` and `name` together should uniquely identify a flow. +# +# The AutoML Benchmark runs AutoML systems across a range of tasks. +# We can not use the flows of the AutoML systems directly, as the benchmark adds performs +# preprocessing as required. +# +# We will break down the flow parameters into several groups, for the tutorial. +# First we will define the name and version information. +# Make sure to leave enough information so others can determine exactly which +# version of the package/script is used. Use tags so users can find your flow easily. + +general = dict( + name="automlbenchmark_autosklearn", + description=( + "Auto-sklearn as set up by the AutoML Benchmark" + "Source: https://github.com/openml/automlbenchmark/releases/tag/v0.9" + ), + external_version="amlb==0.9", + language="English", + tags=["amlb", "benchmark", "study_218"], + dependencies="amlb==0.9", +) + +#################################################################################################### +# Next we define the flow hyperparameters. We define their name and default value in `parameters`, +# and provide meta-data for each parameter through `parameters_meta_info`. +# Note that the use of ordered dicts is required. 
+ +flow_hyperparameters = dict( + parameters=OrderedDict(time="240", memory="32", cores="8"), + parameters_meta_info=OrderedDict( + cores=OrderedDict(description="number of available cores", data_type="int"), + memory=OrderedDict(description="memory in gigabytes", data_type="int"), + time=OrderedDict(description="time in minutes", data_type="int"), + ), +) + +#################################################################################################### +# It is possible for flows to contain subflows. In this example, the auto-sklearn flow is a +# subflow, this means that the subflow is entirely executed as part of this flow. +# Using this modularity also allows your runs to specify which hyperparameters of the +# subflows were used! +# +# Note: flow 15275 is not actually the right flow on the test server, +# but that does not matter for this demonstration. + +autosklearn_flow = openml.flows.get_flow(15275) # auto-sklearn 0.5.1 +subflow = dict(components=OrderedDict(automl_tool=autosklearn_flow),) + +#################################################################################################### +# With all parameters of the flow defined, we can now initialize the OpenMLFlow and publish. +# Explicitly set the model of the flow to `None`, because we provided all the details already! + +autosklearn_amlb_flow = openml.flows.OpenMLFlow( + **general, **flow_hyperparameters, **subflow, model=None, +) +autosklearn_amlb_flow.publish() +print(f"autosklearn flow created: {autosklearn_amlb_flow.flow_id}") +# for dev purposes, since we're rerunning this often, we want to double-check no new flows are created + +#################################################################################################### +# 2. Using the flow +# ==================== +# This Section will show how to upload run data for your custom flow. +# From 2a6903b6684355001ffdfbf4d51940be45b9428e Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 7 Jul 2020 22:27:35 +0200 Subject: [PATCH 24/36] Add Section: Using the Flow It is incomplete as while trying to explain how to format the predictions, I realized a utility function is required. --- examples/30_extended/custom_flow_tutorial.py | 41 +++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index c72cd949b..ca4ca9f8c 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -87,10 +87,47 @@ ) autosklearn_amlb_flow.publish() print(f"autosklearn flow created: {autosklearn_amlb_flow.flow_id}") -# for dev purposes, since we're rerunning this often, we want to double-check no new flows are created #################################################################################################### # 2. Using the flow # ==================== # This Section will show how to upload run data for your custom flow. -# +# Take care to change the values of parameters as well as the task id, +# to reflect the actual run. +# Task and parameter values in the example are fictional. 
+ +flow_id = autosklearn_amlb_flow.flow_id + +parameters = [ + OrderedDict([("oml:name", "cores"), ("oml:value", 4), ("oml:component", flow_id)]), + OrderedDict([("oml:name", "memory"), ("oml:value", 16), ("oml:component", flow_id)]), + OrderedDict([("oml:name", "time"), ("oml:value", 120), ("oml:component", flow_id)]), +] + +task_id = 115 +task = openml.tasks.get_task(task_id) # Diabetes Task +dataset_id = task.get_dataset().dataset_id + + +#################################################################################################### +# The last bit of information for the run we need are the predicted values. +# The exact format of the predictions will depend on the task. +# [... add later, this clearly seems too complicated to expected users to do] + +predictions = [] # load_format_predictions(task_id, predictions) + +#################################################################################################### +# Finally we can create the OpenMLRun object and upload. +# We use the "setup string" because the used flow was a script. + +benchmark_command = f"python3 runbenchmark.py auto-sklearn medium -m aws -t 119" +my_run = openml.runs.OpenMLRun( + task_id=task_id, + flow_id=flow_id, + dataset_id=dataset_id, + parameter_settings=parameters, + setup_string=benchmark_command, + data_content=predictions, + tags=["study_218"], +) +my_run.publish() From 48024978fa32ad0d29c3ed8bc632ec9056d93317 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Jul 2020 10:53:37 +0200 Subject: [PATCH 25/36] Allow run description text to be custom Previously the description text that accompanies the prediction file was auto-generated with the assumption that the corresponding flow had an extension. To support custom flows (with no extension), this behavior had to be changed. The description can now be passed on initialization. The description describing it was auto generated from run_task is now correctly only added if the run was generated through run_flow_on_task. --- openml/runs/functions.py | 60 +++++++++++++++++++++++++++++++++++++++- openml/runs/run.py | 15 +++++----- 2 files changed, 66 insertions(+), 9 deletions(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index b3b15d16e..ba67e1a8c 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -4,6 +4,7 @@ import io import itertools import os +import time from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING # noqa F401 import warnings @@ -250,7 +251,8 @@ def run_flow_on_task( ) data_content, trace, fold_evaluations, sample_evaluations = res - + fields = [*run_environment, time.strftime("%c"), "Created by run_flow_on_task"] + generated_description = "\n".join(fields) run = OpenMLRun( task_id=task.task_id, flow_id=flow_id, @@ -262,6 +264,7 @@ def run_flow_on_task( data_content=data_content, flow=flow, setup_string=flow.extension.create_setup_string(flow.model), + description_text=generated_description, ) if (upload_flow or avoid_duplicate_runs) and flow.flow_id is not None: @@ -1004,3 +1007,58 @@ def __list_runs(api_call, output_format="dict"): runs = pd.DataFrame.from_dict(runs, orient="index") return runs + + +def format_prediction( + task: OpenMLSupervisedTask, + repeat: int, + fold: int, + index: int, + prediction: Union[str, int, float], + truth: Union[str, int, float], + sample: Optional[int] = None, + proba: Optional[Dict[str, float]] = None, +) -> List[Union[str, int, float]]: + """ Format the predictions in the specific order as required for the run results. 
+ + Parameters + ---------- + task: OpenMLSupervisedTask + Task for which to format the predictions. + repeat: int + From which repeat this predictions is made. + fold: int + From which fold this prediction is made. + index: int + For which index this prediction is made. + prediction: str, int or float + The predicted class label or value. + truth: str, int or float + The true class label or value. + sample: int, optional (default=None) + From which sample set this prediction is made. + Required only for LearningCurve tasks. + proba: Dict[str, float], optional (default=None) + For classification tasks only. + A mapping from each class label to their predicted probability. + The dictionary should contain an entry for each of the `task.class_labels`. + E.g.: {"Iris-Setosa": 0.2, "Iris-Versicolor": 0.7, "Iris-Virginica": 0.1} + + Returns + ------- + A list with elements for the prediction results of a run. + + """ + if isinstance(task, OpenMLClassificationTask): + if proba is None: + raise ValueError("Predicted Class Probabilities are required for classification task") + if not set(task.class_labels) == set(proba): + raise ValueError("Each class should have a predicted probability") + if sample is None: + if isinstance(task, OpenMLLearningCurveTask): + raise ValueError("`sample` can not be none for LearningCurveTask") + else: + sample = 0 + probabilities = [proba[c] for c in task.class_labels] + return [repeat, fold, sample, index, *probabilities, truth, prediction] + return [repeat, fold, index, truth, prediction] diff --git a/openml/runs/run.py b/openml/runs/run.py index a61fc4688..a32907156 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -62,6 +62,7 @@ def __init__( task=None, flow=None, run_id=None, + description_text=None, ): self.uploader = uploader self.uploader_name = uploader_name @@ -87,6 +88,7 @@ def __init__( self.model = model self.tags = tags self.predictions_url = predictions_url + self.description_text = description_text @property def id(self) -> Optional[int]: @@ -264,16 +266,13 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]": if self.flow is None: self.flow = get_flow(self.flow_id) - run_environment = ( - self.flow.extension.get_version_information() - + [time.strftime("%c")] - + ["Created by run_task()"] - ) + if self.description_text is None: + self.description_text = time.strftime("%c") task = get_task(self.task_id) arff_dict = OrderedDict() # type: 'OrderedDict[str, Any]' arff_dict["data"] = self.data_content - arff_dict["description"] = "\n".join(run_environment) + arff_dict["description"] = self.description_text arff_dict["relation"] = "openml_task_{}_predictions".format(task.task_id) if isinstance(task, OpenMLLearningCurveTask): @@ -485,9 +484,9 @@ def _get_file_elements(self) -> Dict: Derived child classes should overwrite this method as necessary. The description field will be populated automatically if not provided. """ - if self.model is None: + if self.parameter_settings is None and self.model is None: raise PyOpenMLError( - "OpenMLRun obj does not contain a model. " "(This should never happen.) " + "OpenMLRun must contain a model or be initialized with parameter_settings." 
) if self.flow_id is None: if self.flow is None: From 7fb64b4f1818685416fe85c6bfb3dbb9e578ac22 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Jul 2020 10:57:16 +0200 Subject: [PATCH 26/36] Draft for Custom Flow tutorial --- examples/30_extended/custom_flow_tutorial.py | 58 ++++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index ca4ca9f8c..33aaaf1a1 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -19,8 +19,11 @@ # connects to the test server at test.openml.org. This prevents the main # server from crowding with example datasets, tasks, runs, and so on. from collections import OrderedDict +import numpy as np import openml +from openml import OpenMLClassificationTask +from openml.runs.functions import format_prediction openml.config.start_using_configuration_for_example() @@ -104,17 +107,62 @@ OrderedDict([("oml:name", "time"), ("oml:value", 120), ("oml:component", flow_id)]), ] -task_id = 115 -task = openml.tasks.get_task(task_id) # Diabetes Task +task_id = 1408 # Iris Task +task = openml.tasks.get_task(task_id) dataset_id = task.get_dataset().dataset_id #################################################################################################### # The last bit of information for the run we need are the predicted values. # The exact format of the predictions will depend on the task. -# [... add later, this clearly seems too complicated to expected users to do] +# The predictions should always be a list of lists, each list should contain: +# - the repeat number: for repeated evaluation strategies. (e.g. repeated cross-validation) +# - the fold number: for cross-validation. (what should this be for holdout?) +# - 0: this field is for backward compatibility. +# - index: the row (of the original dataset) for which the prediction was made. +# - p_1, ..., p_c: for each class the predicted probability of the sample +# belonging to that class. (no elements for regression tasks) +# Make sure the order of these elements follows the order of `task.class_labels`. +# - the predicted class/value for the sample +# - the true class/value for the sample +# +# Here we generated some random predictions in place. +# You can ignore this code, or use it to better understand the formatting of the predictions. 
+# Find the repeats/folds/samples for this task: +n_repeats, n_folds, _ = task.get_split_dimensions() +all_test_indices = [ + (repeat, fold, 0, index) + for repeat in range(n_repeats) + for fold in range(n_folds) + for index in task.get_train_test_split_indices(fold, repeat)[1] +] -predictions = [] # load_format_predictions(task_id, predictions) +# random class probabilities (Iris has 150 samples and 3 classes): +r = np.random.rand(150 * n_repeats, 3) +# scale the random values so that the probabilities of each sample sum to 1: +y_proba = r / r.sum(axis=1).reshape(-1, 1) +y_pred = y_proba.argmax(axis=1) +class_map = dict(zip(range(3), task.class_labels)) +y_true = ["Iris-setosa"] * 50 + ["Iris-versicolor"] * 50 + ["Iris-virginica"] * 50 + +predictions = [] +ps = [] + +for where, y, yp, proba in zip(all_test_indices, y_true, y_pred, y_proba): + predictions.append([*where, *proba, class_map[yp], y]) + repeat, fold, sample, index = where + + p = format_prediction( + task=task, + repeat=repeat, + fold=fold, + sample=sample, + index=index, + prediction=class_map[yp], + truth=y, + proba={c: pb for (c, pb) in zip(task.class_labels, proba)}, + ) + ps.append(p) #################################################################################################### # Finally we can create the OpenMLRun object and upload. @@ -129,5 +177,7 @@ setup_string=benchmark_command, data_content=predictions, tags=["study_218"], + description_text="Run generated by the Custom Flow tutorial.", ) my_run.publish() +print("run created:", my_run.run_id) From a6f0a389d114815340311aa1905932873d0496bc Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Jul 2020 17:54:54 +0200 Subject: [PATCH 27/36] Add minimal docstring to OpenMLRun I am not for each field what the specifications are. --- openml/runs/run.py | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/openml/runs/run.py b/openml/runs/run.py index a32907156..b8be9c3a3 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -27,14 +27,37 @@ class OpenMLRun(OpenMLBase): """OpenML Run: result of running a model on an openml dataset. - Parameters - ---------- - task_id : int - Refers to the task. - flow_id : int - Refers to the flow. - dataset_id: int - Refers to the data. + Parameters + ---------- + task_id: int + flow_id: int + dataset_id: int + setup_string: str + output_files: Dict[str, str] + A dictionary that specifies where each related file can be found. + setup_id: int + tags: List[str] + uploader: int + User ID of the uploader. + uploader_name: str + evaluations: Dict + fold_evaluations: Dict + sample_evaluations: Dict + data_content: List[List] + The predictions generated from executing this run. + trace: OpenMLRunTrace + model: object + task_type: str + task_evaluation_measure: str + flow_name: str + parameter_settings: List[OrderedDict] + predictions_url: str + task: OpenMLTask + flow: OpenMLFlow + run_id: int + description_text: str, optional + Description text to add to the predictions file. 
+ If left None, """ def __init__( From 3748ae011bc6e4e0cebdba1adb04df920fabbfe2 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Jul 2020 18:11:54 +0200 Subject: [PATCH 28/36] Process code review feedback In particular: - text changes - fetch true labels from the dataset instead --- examples/30_extended/custom_flow_tutorial.py | 40 ++++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index 33aaaf1a1..cb498a575 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -32,13 +32,13 @@ # ==================== # The first step is to define all the hyperparameters of your flow. # Check ... for the descriptions of each variable. -# Note that `external version` and `name` together should uniquely identify a flow. +# Note that `external version` and `name` together uniquely identify a flow. # # The AutoML Benchmark runs AutoML systems across a range of tasks. -# We can not use the flows of the AutoML systems directly, as the benchmark adds performs -# preprocessing as required. +# OpenML stores Flows for each AutoML system. However, the AutoML benchmark adds +# preprocessing to the flow, so should be described in a new flow. # -# We will break down the flow parameters into several groups, for the tutorial. +# We will break down the flow arguments into several groups, for the tutorial. # First we will define the name and version information. # Make sure to leave enough information so others can determine exactly which # version of the package/script is used. Use tags so users can find your flow easily. @@ -57,7 +57,7 @@ #################################################################################################### # Next we define the flow hyperparameters. We define their name and default value in `parameters`, -# and provide meta-data for each parameter through `parameters_meta_info`. +# and provide meta-data for each hyperparameter through `parameters_meta_info`. # Note that the use of ordered dicts is required. flow_hyperparameters = dict( @@ -74,6 +74,7 @@ # subflow, this means that the subflow is entirely executed as part of this flow. # Using this modularity also allows your runs to specify which hyperparameters of the # subflows were used! +# Using a subflow is not required. # # Note: flow 15275 is not actually the right flow on the test server, # but that does not matter for this demonstration. @@ -115,6 +116,7 @@ #################################################################################################### # The last bit of information for the run we need are the predicted values. # The exact format of the predictions will depend on the task. +# # The predictions should always be a list of lists, each list should contain: # - the repeat number: for repeated evaluation strategies. (e.g. repeated cross-validation) # - the fold number: for cross-validation. (what should this be for holdout?) @@ -126,12 +128,18 @@ # - the predicted class/value for the sample # - the true class/value for the sample # +# When using openml-python extensions (such as through `run_model_on_task`), +# all of this formatting is automatic. +# Unfortunately we can not automate this procedure for custom flows, +# which means a little additional effort is required. +# # Here we generated some random predictions in place. # You can ignore this code, or use it to better understand the formatting of the predictions. 
-# Find the repeats/folds/samples for this task: +# +# Find the repeats/folds for this task: n_repeats, n_folds, _ = task.get_split_dimensions() all_test_indices = [ - (repeat, fold, 0, index) + (repeat, fold, index) for repeat in range(n_repeats) for fold in range(n_folds) for index in task.get_train_test_split_indices(fold, repeat)[1] @@ -142,31 +150,31 @@ # scale the random values so that the probabilities of each sample sum to 1: y_proba = r / r.sum(axis=1).reshape(-1, 1) y_pred = y_proba.argmax(axis=1) + class_map = dict(zip(range(3), task.class_labels)) -y_true = ["Iris-setosa"] * 50 + ["Iris-versicolor"] * 50 + ["Iris-virginica"] * 50 +_, y_true = task.get_X_and_y() +y_true = [class_map[y] for y in y_true] +# We format the predictions with the utility function `format_prediction`. +# It will organize the relevant data in the expected format/order. predictions = [] -ps = [] - for where, y, yp, proba in zip(all_test_indices, y_true, y_pred, y_proba): - predictions.append([*where, *proba, class_map[yp], y]) - repeat, fold, sample, index = where + repeat, fold, index = where - p = format_prediction( + prediction = format_prediction( task=task, repeat=repeat, fold=fold, - sample=sample, index=index, prediction=class_map[yp], truth=y, proba={c: pb for (c, pb) in zip(task.class_labels, proba)}, ) - ps.append(p) + predictions.append(prediction) #################################################################################################### # Finally we can create the OpenMLRun object and upload. -# We use the "setup string" because the used flow was a script. +# We use the argument setup_string because the used flow was a script." benchmark_command = f"python3 runbenchmark.py auto-sklearn medium -m aws -t 119" my_run = openml.runs.OpenMLRun( From 5479d7bf8de502f8cc8e21cccc0eabc132a3a383 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Jul 2020 18:22:19 +0200 Subject: [PATCH 29/36] Use the format utility function in automatic runs To format the predictions. 
--- openml/runs/functions.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index ba67e1a8c..77b803c6c 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -481,13 +481,17 @@ def _calculate_local_measure(sklearn_fn, openml_name): for i, tst_idx in enumerate(test_indices): - arff_line = [rep_no, fold_no, sample_no, tst_idx] # type: List[Any] if task.class_labels is not None: - for j, class_label in enumerate(task.class_labels): - arff_line.append(proba_y[i][j]) - - arff_line.append(task.class_labels[pred_y[i]]) - arff_line.append(task.class_labels[test_y[i]]) + arff_line = format_prediction( + task=task, + repeat=rep_no, + fold=fold_no, + sample=sample_no, + index=tst_idx, + prediction=task.class_labels[pred_y[i]], + truth=task.class_labels[test_y[i]], + proba=dict(zip(task.class_labels, proba_y[i])), + ) else: raise ValueError("The task has no class labels") @@ -501,7 +505,15 @@ def _calculate_local_measure(sklearn_fn, openml_name): elif isinstance(task, OpenMLRegressionTask): for i in range(0, len(test_indices)): - arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y[i]] + arff_line = format_prediction( + task=task, + repeat=rep_no, + fold=fold_no, + index=test_indices[i], + prediction=pred_y[i], + truth=test_y[i], + ) + arff_datacontent.append(arff_line) if add_local_measures: From 4b71c30d0235cad1d8d9060c08053d5f0399c638 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 13 Jul 2020 13:35:54 +0200 Subject: [PATCH 30/36] Process @mfeurer feedback --- examples/30_extended/custom_flow_tutorial.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index cb498a575..70c4adfb8 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -58,7 +58,8 @@ #################################################################################################### # Next we define the flow hyperparameters. We define their name and default value in `parameters`, # and provide meta-data for each hyperparameter through `parameters_meta_info`. -# Note that the use of ordered dicts is required. +# Note that eventhough the argument name is `parameters` they describe the hyperparameters. +# The use of ordered dicts is required. flow_hyperparameters = dict( parameters=OrderedDict(time="240", memory="32", cores="8"), @@ -84,7 +85,13 @@ #################################################################################################### # With all parameters of the flow defined, we can now initialize the OpenMLFlow and publish. -# Explicitly set the model of the flow to `None`, because we provided all the details already! +# Because we provided all the details already, we do not need to provide a `model` to the flow. +# +# In our case, we don't even have a model. It is possible to have a model but still require +# to follow these steps when the model (python object) does not have an extensions from which +# to automatically extract the hyperparameters. +# So whether you have a model with no extension or no model at all, explicitly set +# the model of the flow to `None`. 
autosklearn_amlb_flow = openml.flows.OpenMLFlow( **general, **flow_hyperparameters, **subflow, model=None, @@ -174,7 +181,7 @@ #################################################################################################### # Finally we can create the OpenMLRun object and upload. -# We use the argument setup_string because the used flow was a script." +# We use the argument setup_string because the used flow was a script. benchmark_command = f"python3 runbenchmark.py auto-sklearn medium -m aws -t 119" my_run = openml.runs.OpenMLRun( From 942d66e960168a2d355ba1538398718b9a2f5f49 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 12 Aug 2020 18:56:50 +0200 Subject: [PATCH 31/36] Update subflow paragraph --- examples/30_extended/custom_flow_tutorial.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index 70c4adfb8..e0ee6a21c 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -71,10 +71,14 @@ ) #################################################################################################### -# It is possible for flows to contain subflows. In this example, the auto-sklearn flow is a -# subflow, this means that the subflow is entirely executed as part of this flow. -# Using this modularity also allows your runs to specify which hyperparameters of the -# subflows were used! +# It is possible to build a flow which uses other flows. +# For example, the Random Forest Classifier is a flow, but you could also construct a flow +# which uses a Random Forest Classifier in a ML pipeline. When constructing the pipeline flow, +# you can use the Random Forest Classifier flow as a *subflow*. It allows for +# all hyperparameters of the Random Classifier Flow to also be specified in your pipeline flow. +# +# In this example, the auto-sklearn flow is a subflow: the auto-sklearn flow is entirely executed as part of this flow. +# This allows people to specify auto-sklearn hyperparameters used in this flow. # Using a subflow is not required. 
# # Note: flow 15275 is not actually the right flow on the test server, From 9ba363ec6e60a7bf700d6c2b8bd7d97ed03f3089 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 14 Aug 2020 14:38:21 +0200 Subject: [PATCH 32/36] Check the ClassificationTask has class label set --- openml/runs/functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 77b803c6c..ef8880b23 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -1064,6 +1064,8 @@ def format_prediction( if isinstance(task, OpenMLClassificationTask): if proba is None: raise ValueError("Predicted Class Probabilities are required for classification task") + if task.class_labels is None: + raise ValueError("The classification task must have class labels set") if not set(task.class_labels) == set(proba): raise ValueError("Each class should have a predicted probability") if sample is None: From a72053d4dbcbfca3c0ccf67fc5cb5a8bc5a87603 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 17 Aug 2020 10:38:31 +0200 Subject: [PATCH 33/36] Test task is of supported type --- openml/runs/functions.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index ef8880b23..0dfc68d60 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -830,7 +830,7 @@ def list_runs( study: Optional[int] = None, display_errors: bool = False, output_format: str = "dict", - **kwargs + **kwargs, ) -> Union[Dict, pd.DataFrame]: """ List all runs matching all of the given filters. @@ -902,7 +902,7 @@ def list_runs( tag=tag, study=study, display_errors=display_errors, - **kwargs + **kwargs, ) @@ -915,7 +915,7 @@ def _list_runs( study: Optional[int] = None, display_errors: bool = False, output_format: str = "dict", - **kwargs + **kwargs, ) -> Union[Dict, pd.DataFrame]: """ Perform API call `/run/list/{filters}' @@ -1075,4 +1075,7 @@ def format_prediction( sample = 0 probabilities = [proba[c] for c in task.class_labels] return [repeat, fold, sample, index, *probabilities, truth, prediction] - return [repeat, fold, index, truth, prediction] + elif isinstance(task, OpenMLRegressionTask): + return [repeat, fold, index, truth, prediction] + else: + raise TypeError(f"Formatting for {type(task)} is not supported.") From de3149057250c38d530cba4408cfe0738249caf6 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 17 Aug 2020 10:42:28 +0200 Subject: [PATCH 34/36] Add tests for format_prediction --- tests/test_runs/test_run_functions.py | 46 ++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index aca9580c9..86126f306 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -22,10 +22,7 @@ import openml.extensions.sklearn from openml.testing import TestBase, SimpleImputer -from openml.runs.functions import ( - _run_task_get_arffcontent, - run_exists, -) +from openml.runs.functions import _run_task_get_arffcontent, run_exists, format_prediction from openml.runs.trace import OpenMLRunTrace from openml.tasks import TaskTypeEnum @@ -1342,3 +1339,44 @@ def test_run_flow_on_task_downloaded_flow(self): run.publish() TestBase._mark_entity_for_removal("run", run.run_id) TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], run.run_id)) + + def test_format_prediction_non_supervised(self): + # non-supervised tasks don't exist on the test server + 
openml.config.server = self.production_server + clustering = openml.tasks.get_task(126033, download_data=False) + ignored_input = [0] * 5 + with self.assertRaises(TypeError): + format_prediction(clustering, *ignored_input) + + def test_format_prediction_classification_no_probabilities(self): + classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) + ignored_input = [0] * 5 + with self.assertRaises(ValueError): + format_prediction(classification, *ignored_input, proba=None) + + def test_format_prediction_classification_incomplete_probabilities(self): + classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) + ignored_input = [0] * 5 + incomplete_probabilities = {c: 0.2 for c in classification.class_labels[1:]} + with self.assertRaises(ValueError): + format_prediction(classification, *ignored_input, proba=incomplete_probabilities) + + def test_format_prediction_task_without_classlabels_set(self): + classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) + classification.class_labels = None + ignored_input = [0] * 5 + with self.assertRaises(ValueError): + format_prediction(classification, *ignored_input, proba={}) + + def test_format_prediction_task_learning_curve_sample_not_set(self): + learning_curve = openml.tasks.get_task(801, download_data=False) + probabilities = {c: 0.2 for c in learning_curve.class_labels} + ignored_input = [0] * 5 + with self.assertRaises(ValueError): + format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) + + def test_format_prediction_task_regression(self): + regression = openml.tasks.get_task(self.TEST_SERVER_TASK_REGRESSION[0], download_data=False) + ignored_input = [0] * 5 + res = format_prediction(regression, *ignored_input) + self.assertListEqual(res, [0] * 5) From 832f437336251e735b40b8b8f5cdb95d83cbd743 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 25 Aug 2020 14:21:59 +0200 Subject: [PATCH 35/36] Process feedback Neeratyoy --- examples/30_extended/custom_flow_tutorial.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index e0ee6a21c..737c67c75 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -4,7 +4,8 @@ ================================ The most convenient way to create a flow for your machine learning workflow is to generate it -automatically as described in <>. However, there are scenarios where this is not possible, such +automatically as described in the `Obtain Flow IDs `_ tutorial. # noqa E501 +However, there are scenarios where this is not possible, such as when the flow uses a framework without an extension or when the flow is described by a script. In those cases you can still create a custom flow by following the steps of this tutorial. @@ -58,7 +59,7 @@ #################################################################################################### # Next we define the flow hyperparameters. We define their name and default value in `parameters`, # and provide meta-data for each hyperparameter through `parameters_meta_info`. -# Note that eventhough the argument name is `parameters` they describe the hyperparameters. +# Note that even though the argument name is `parameters` they describe the hyperparameters. # The use of ordered dicts is required. 
flow_hyperparameters = dict( @@ -79,7 +80,7 @@ # # In this example, the auto-sklearn flow is a subflow: the auto-sklearn flow is entirely executed as part of this flow. # This allows people to specify auto-sklearn hyperparameters used in this flow. -# Using a subflow is not required. +# In general, using a subflow is not required. # # Note: flow 15275 is not actually the right flow on the test server, # but that does not matter for this demonstration. From 3cc74de6f90a8e69b7bd396b1a547a7c007de4e7 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 28 Aug 2020 17:05:24 +0200 Subject: [PATCH 36/36] Test Exception with Regex Also throw NotImplementedError instead of TypeError for unsupported task types. Added links in the example. --- examples/30_extended/custom_flow_tutorial.py | 6 ++++-- openml/runs/functions.py | 4 ++-- tests/test_runs/test_run_functions.py | 14 +++++++++----- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_tutorial.py index 737c67c75..3b918e108 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_tutorial.py @@ -9,7 +9,7 @@ as when the flow uses a framework without an extension or when the flow is described by a script. In those cases you can still create a custom flow by following the steps of this tutorial. -As an example we will use the flows generated for the AutoML Benchmark (...), +As an example we will use the flows generated for the `AutoML Benchmark `_, and also show how to link runs to the custom flow. """ @@ -32,7 +32,7 @@ # 1. Defining the flow # ==================== # The first step is to define all the hyperparameters of your flow. -# Check ... for the descriptions of each variable. +# The API pages feature a descriptions of each variable of the `OpenMLFlow `_. # noqa E501 # Note that `external version` and `name` together uniquely identify a flow. # # The AutoML Benchmark runs AutoML systems across a range of tasks. 
@@ -201,3 +201,5 @@ ) my_run.publish() print("run created:", my_run.run_id) + +openml.config.stop_using_configuration_for_example() diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 0dfc68d60..a3888d3a1 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -1063,7 +1063,7 @@ def format_prediction( """ if isinstance(task, OpenMLClassificationTask): if proba is None: - raise ValueError("Predicted Class Probabilities are required for classification task") + raise ValueError("`proba` is required for classification task") if task.class_labels is None: raise ValueError("The classification task must have class labels set") if not set(task.class_labels) == set(proba): @@ -1078,4 +1078,4 @@ def format_prediction( elif isinstance(task, OpenMLRegressionTask): return [repeat, fold, index, truth, prediction] else: - raise TypeError(f"Formatting for {type(task)} is not supported.") + raise NotImplementedError(f"Formatting for {type(task)} is not supported.") diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 86126f306..fc53ea366 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1345,34 +1345,38 @@ def test_format_prediction_non_supervised(self): openml.config.server = self.production_server clustering = openml.tasks.get_task(126033, download_data=False) ignored_input = [0] * 5 - with self.assertRaises(TypeError): + with self.assertRaisesRegex( + NotImplementedError, r"Formatting for is not supported." + ): format_prediction(clustering, *ignored_input) def test_format_prediction_classification_no_probabilities(self): classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) ignored_input = [0] * 5 - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) def test_format_prediction_classification_incomplete_probabilities(self): classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) ignored_input = [0] * 5 incomplete_probabilities = {c: 0.2 for c in classification.class_labels[1:]} - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Each class should have a predicted probability"): format_prediction(classification, *ignored_input, proba=incomplete_probabilities) def test_format_prediction_task_without_classlabels_set(self): classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) classification.class_labels = None ignored_input = [0] * 5 - with self.assertRaises(ValueError): + with self.assertRaisesRegex( + ValueError, "The classification task must have class labels set" + ): format_prediction(classification, *ignored_input, proba={}) def test_format_prediction_task_learning_curve_sample_not_set(self): learning_curve = openml.tasks.get_task(801, download_data=False) probabilities = {c: 0.2 for c in learning_curve.class_labels} ignored_input = [0] * 5 - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "`sample` can not be none for LearningCurveTask"): format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) def test_format_prediction_task_regression(self):
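To see how the pieces introduced by these patches fit together, here is a minimal sketch of uploading a run for a custom flow against the test server, using the Iris task (1408) from the tutorial. It is an illustrative sketch only: the flow id (`my_flow_id`), the random class probabilities, and the placeholder truth labels are stand-ins for values a real setup would supply, while the `format_prediction` signature and the `OpenMLRun` arguments follow the code added above.

from collections import OrderedDict

import numpy as np
import openml
from openml.runs.functions import format_prediction

openml.config.start_using_configuration_for_example()

task = openml.tasks.get_task(1408)  # Iris task on the test server, as in the tutorial
class_labels = task.class_labels
n_repeats, n_folds, _ = task.get_split_dimensions()

predictions = []
for repeat in range(n_repeats):
    for fold in range(n_folds):
        _, test_indices = task.get_train_test_split_indices(fold=fold, repeat=repeat)
        for index in test_indices:
            # Random probabilities stand in for the output of a real model.
            proba = np.random.dirichlet(np.ones(len(class_labels)))
            predicted = class_labels[int(np.argmax(proba))]
            truth = class_labels[0]  # placeholder; a real run would look up the true label
            predictions.append(
                format_prediction(
                    task=task,
                    repeat=repeat,
                    fold=fold,
                    index=index,
                    prediction=predicted,
                    truth=truth,
                    proba=dict(zip(class_labels, proba)),
                )
            )

# Placeholder flow id: use the id printed when the custom flow was published.
my_flow_id = 12345
parameters = [
    OrderedDict([("oml:name", "time"), ("oml:value", 240), ("oml:component", my_flow_id)]),
]
my_run = openml.runs.OpenMLRun(
    task_id=task.task_id,
    flow_id=my_flow_id,
    dataset_id=task.get_dataset().dataset_id,
    parameter_settings=parameters,
    setup_string="python3 runbenchmark.py auto-sklearn",
    data_content=predictions,
    tags=["study_218"],
    description_text="Run generated by the custom flow tutorial sketch.",
)
# my_run.publish()  # uncomment to upload; requires an API key for the test server

openml.config.stop_using_configuration_for_example()

For a regression task, no `proba` argument is needed and `format_prediction` returns `[repeat, fold, index, truth, prediction]`, matching the regression branch added in the patches above.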