
Feature #753 (#932)

Merged: 37 commits, Sep 2, 2020
Commits (the diff below shows changes from 3 of the 37 commits)
b3b7867  Create first section: Creating Custom Flow (PGijsbers, Jul 7, 2020)
19d79d7  Add Section: Using the Flow (PGijsbers, Jul 7, 2020)
208f6cd  Allow run description text to be custom (PGijsbers, Jul 10, 2020)
2247bbc  Draft for Custom Flow tutorial (PGijsbers, Jul 10, 2020)
326510c  Add minimal docstring to OpenMLRun (PGijsbers, Jul 10, 2020)
872bd75  Process code review feedback (PGijsbers, Jul 10, 2020)
c3a5326  Use the format utility function in automatic runs (PGijsbers, Jul 10, 2020)
a7cb290  Process @mfeurer feedback (PGijsbers, Jul 13, 2020)
e5dcaf0  Rename arguments of list_evaluations (#933) (Bilgecelik, Jul 14, 2020)
1670050  adding config file to user guide (#931) (marcoslbueno, Jul 14, 2020)
9c93f5b  Edit api (#935) (sahithyaravi, Jul 23, 2020)
666ca68  Adding support for scikit-learn > 0.22 (#936) (Neeratyoy, Aug 3, 2020)
5d9c69c  Add flake8-print in pre-commit (#939) (22quinn, Aug 3, 2020)
7d51a76  Fix edit api (#940) (sahithyaravi, Aug 7, 2020)
75a5440  Update subflow paragraph (PGijsbers, Aug 12, 2020)
23a08ab  Check the ClassificationTask has class label set (PGijsbers, Aug 14, 2020)
95d1fcb  Test task is of supported type (PGijsbers, Aug 17, 2020)
41aa789  Add tests for format_prediction (PGijsbers, Aug 17, 2020)
5d2e0ce  Adding Python 3.8 support (#916) (Neeratyoy, Aug 17, 2020)
5ef24ab  Process feedback Neeratyoy (PGijsbers, Aug 25, 2020)
1ce5a12  Test Exception with Regex (PGijsbers, Aug 28, 2020)
f70c720  change edit_api to reflect server (#941) (sahithyaravi, Aug 31, 2020)
f8839de  Create first section: Creating Custom Flow (PGijsbers, Jul 7, 2020)
2a6903b  Add Section: Using the Flow (PGijsbers, Jul 7, 2020)
4802497  Allow run description text to be custom (PGijsbers, Jul 10, 2020)
7fb64b4  Draft for Custom Flow tutorial (PGijsbers, Jul 10, 2020)
a6f0a38  Add minimal docstring to OpenMLRun (PGijsbers, Jul 10, 2020)
3748ae0  Process code review feedback (PGijsbers, Jul 10, 2020)
5479d7b  Use the format utility function in automatic runs (PGijsbers, Jul 10, 2020)
4b71c30  Process @mfeurer feedback (PGijsbers, Jul 13, 2020)
942d66e  Update subflow paragraph (PGijsbers, Aug 12, 2020)
9ba363e  Check the ClassificationTask has class label set (PGijsbers, Aug 14, 2020)
a72053d  Test task is of supported type (PGijsbers, Aug 17, 2020)
de31490  Add tests for format_prediction (PGijsbers, Aug 17, 2020)
832f437  Process feedback Neeratyoy (PGijsbers, Aug 25, 2020)
3cc74de  Test Exception with Regex (PGijsbers, Aug 28, 2020)
03e1e8b  Merge branch 'feature_#753' of https://github.com/openml/openml-pytho… (PGijsbers, Sep 1, 2020)
40 changes: 24 additions & 16 deletions examples/30_extended/custom_flow_tutorial.py
@@ -32,13 +32,13 @@
# ====================
# The first step is to define all the hyperparameters of your flow.
# Check ... for the descriptions of each variable.
# Note that `external version` and `name` together should uniquely identify a flow.
# Note that `external version` and `name` together uniquely identify a flow.
#
# The AutoML Benchmark runs AutoML systems across a range of tasks.
# We can not use the flows of the AutoML systems directly, as the benchmark adds performs
# preprocessing as required.
# OpenML stores Flows for each AutoML system. However, the AutoML benchmark adds
# preprocessing to the flow, so should be described in a new flow.
#
# We will break down the flow parameters into several groups, for the tutorial.
# We will break down the flow arguments into several groups, for the tutorial.
# First we will define the name and version information.
# Make sure to leave enough information so others can determine exactly which
# version of the package/script is used. Use tags so users can find your flow easily.
@@ -57,7 +57,7 @@

####################################################################################################
# Next we define the flow hyperparameters. We define their name and default value in `parameters`,
# and provide meta-data for each parameter through `parameters_meta_info`.
# and provide meta-data for each hyperparameter through `parameters_meta_info`.
# Note that the use of ordered dicts is required.

flow_hyperparameters = dict(
@@ -74,6 +74,7 @@
# subflow, this means that the subflow is entirely executed as part of this flow.
# Using this modularity also allows your runs to specify which hyperparameters of the
# subflows were used!
# Using a subflow is not required.
#
# Note: flow 15275 is not actually the right flow on the test server,
# but that does not matter for this demonstration.
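Pulling the pieces of this first section together, the flow definition the tutorial builds looks roughly as follows. This is a sketch only: the names and values are illustrative, and flow 15275 is the stand-in subflow id mentioned in the hunk above.

from collections import OrderedDict

import openml

# Name and version information; `name` plus `external_version` identify the flow.
general = dict(
    name="automlbenchmark_autosklearn",
    description="Auto-sklearn as set up by the AutoML Benchmark",
    external_version="amlb==0.9",
    language="English",
    tags=["amlb", "benchmark"],
    dependencies="amlb==0.9",
)

# Hyperparameters and their meta-data; note that ordered dicts are required.
flow_hyperparameters = dict(
    parameters=OrderedDict(time="240", memory="32", cores="8"),
    parameters_meta_info=OrderedDict(
        time=OrderedDict(description="time in minutes", data_type="int"),
        memory=OrderedDict(description="memory in gigabytes", data_type="int"),
        cores=OrderedDict(description="number of available cores", data_type="int"),
    ),
)

# The subflow is optional; pass an empty OrderedDict as `components` to omit it.
autosklearn_flow = openml.flows.get_flow(15275)
subflow = dict(components=OrderedDict(automl_tool=autosklearn_flow))

my_flow = openml.flows.OpenMLFlow(model=None, **general, **flow_hyperparameters, **subflow)
my_flow.publish()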
@@ -115,6 +116,7 @@
####################################################################################################
# The last bit of information we need for the run is the predicted values.
# The exact format of the predictions will depend on the task.
#
# The predictions should always be a list of lists, each list should contain:
# - the repeat number: for repeated evaluation strategies. (e.g. repeated cross-validation)
# - the fold number: for cross-validation. (what should this be for holdout?)
@@ -126,12 +128,18 @@
# - the predicted class/value for the sample
# - the true class/value for the sample
#
# When using openml-python extensions (such as through `run_model_on_task`),
# all of this formatting is automatic.
# Unfortunately we can not automate this procedure for custom flows,
# which means a little additional effort is required.
#
# Here we generated some random predictions in place.
# You can ignore this code, or use it to better understand the formatting of the predictions.
# Find the repeats/folds/samples for this task:
#
# Find the repeats/folds for this task:
n_repeats, n_folds, _ = task.get_split_dimensions()
all_test_indices = [
(repeat, fold, 0, index)
(repeat, fold, index)
for repeat in range(n_repeats)
for fold in range(n_folds)
for index in task.get_train_test_split_indices(fold, repeat)[1]
@@ -142,31 +150,31 @@
# scale the random values so that the probabilities of each sample sum to 1:
y_proba = r / r.sum(axis=1).reshape(-1, 1)
y_pred = y_proba.argmax(axis=1)

class_map = dict(zip(range(3), task.class_labels))
y_true = ["Iris-setosa"] * 50 + ["Iris-versicolor"] * 50 + ["Iris-virginica"] * 50
_, y_true = task.get_X_and_y()
y_true = [class_map[y] for y in y_true]

# We format the predictions with the utility function `format_prediction`.
# It will organize the relevant data in the expected format/order.
predictions = []
ps = []

for where, y, yp, proba in zip(all_test_indices, y_true, y_pred, y_proba):
predictions.append([*where, *proba, class_map[yp], y])
repeat, fold, sample, index = where
repeat, fold, index = where

p = format_prediction(
prediction = format_prediction(
task=task,
repeat=repeat,
fold=fold,
sample=sample,
index=index,
prediction=class_map[yp],
truth=y,
proba={c: pb for (c, pb) in zip(task.class_labels, proba)},
)
ps.append(p)
predictions.append(prediction)

####################################################################################################
# Finally we can create the OpenMLRun object and upload.
# We use the "setup string" because the used flow was a script.
# We use the argument setup_string because the used flow was a script.

benchmark_command = f"python3 runbenchmark.py auto-sklearn medium -m aws -t 119"
my_run = openml.runs.OpenMLRun(
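For reference, the prediction-formatting code in this file reads, in one piece, roughly as follows. This is a reconstruction from the paired old/new lines in the hunks above, resolved toward the `format_prediction`-based version; `task` is assumed to be the tutorial's three-class iris classification task.

import numpy as np

from openml.runs.functions import format_prediction

# Enumerate the (repeat, fold, test index) triples of the task's splits.
n_repeats, n_folds, _ = task.get_split_dimensions()
all_test_indices = [
    (repeat, fold, index)
    for repeat in range(n_repeats)
    for fold in range(n_folds)
    for index in task.get_train_test_split_indices(fold, repeat)[1]
]

# Random predictions, scaled so the probabilities of each sample sum to 1.
r = np.random.rand(len(all_test_indices), 3)
y_proba = r / r.sum(axis=1).reshape(-1, 1)
y_pred = y_proba.argmax(axis=1)

class_map = dict(zip(range(3), task.class_labels))
_, y_true = task.get_X_and_y()
y_true = [class_map[y] for y in y_true]

# `format_prediction` organizes each row of data in the expected format/order.
predictions = []
for (repeat, fold, index), y, yp, proba in zip(all_test_indices, y_true, y_pred, y_proba):
    prediction = format_prediction(
        task=task,
        repeat=repeat,
        fold=fold,
        index=index,
        prediction=class_map[yp],
        truth=y,
        proba={c: pb for (c, pb) in zip(task.class_labels, proba)},
    )
    predictions.append(prediction)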
26 changes: 19 additions & 7 deletions openml/runs/functions.py
@@ -481,13 +481,17 @@ def _calculate_local_measure(sklearn_fn, openml_name):

for i, tst_idx in enumerate(test_indices):

arff_line = [rep_no, fold_no, sample_no, tst_idx] # type: List[Any]
if task.class_labels is not None:
for j, class_label in enumerate(task.class_labels):
arff_line.append(proba_y[i][j])

arff_line.append(task.class_labels[pred_y[i]])
arff_line.append(task.class_labels[test_y[i]])
arff_line = format_prediction(
task=task,
repeat=rep_no,
fold=fold_no,
sample=sample_no,
index=tst_idx,
prediction=task.class_labels[pred_y[i]],
truth=task.class_labels[test_y[i]],
proba=dict(zip(task.class_labels, proba_y[i])),
)
else:
raise ValueError("The task has no class labels")

@@ -501,7 +505,15 @@ def _calculate_local_measure(sklearn_fn, openml_name):
elif isinstance(task, OpenMLRegressionTask):

for i in range(0, len(test_indices)):
arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y[i]]
arff_line = format_prediction(
task=task,
repeat=rep_no,
fold=fold_no,
index=test_indices[i],
prediction=pred_y[i],
truth=test_y[i],
)

arff_datacontent.append(arff_line)

if add_local_measures:
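The utility replaces the hand-built `arff_line` lists in both branches above. A hypothetical call (placeholder task object and illustrative values) shows the row layout it produces for a three-class classification task:

from openml.runs.functions import format_prediction

# Row layout: [repeat, fold, sample, index, proba_A, proba_B, proba_C, prediction, truth]
row = format_prediction(
    task=my_classification_task,  # placeholder OpenMLClassificationTask with three class labels
    repeat=0,
    fold=2,
    sample=0,
    index=140,
    prediction="Iris-virginica",
    truth="Iris-virginica",
    proba={"Iris-setosa": 0.1, "Iris-versicolor": 0.2, "Iris-virginica": 0.7},
)
# -> [0, 2, 0, 140, 0.1, 0.2, 0.7, "Iris-virginica", "Iris-virginica"]
# For a regression task the row is simply [repeat, fold, index, prediction, truth].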
39 changes: 31 additions & 8 deletions openml/runs/run.py
@@ -27,14 +27,37 @@
class OpenMLRun(OpenMLBase):
"""OpenML Run: result of running a model on an openml dataset.

Parameters
----------
task_id : int
Refers to the task.
flow_id : int
Refers to the flow.
dataset_id: int
Refers to the data.
Parameters
----------
task_id: int
flow_id: int
dataset_id: int
setup_string: str
output_files: Dict[str, str]
A dictionary that specifies where each related file can be found.
setup_id: int
tags: List[str]
uploader: int
User ID of the uploader.
uploader_name: str
evaluations: Dict
fold_evaluations: Dict
sample_evaluations: Dict
data_content: List[List]
The predictions generated from executing this run.
trace: OpenMLRunTrace
model: object
task_type: str
task_evaluation_measure: str
flow_name: str
parameter_settings: List[OrderedDict]
predictions_url: str
task: OpenMLTask
flow: OpenMLFlow
run_id: int
description_text: str, optional
Description text to add to the predictions file.
If left None, is set to the time the arff file is generated.
"""

def __init__(
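With the parameters documented above, creating and publishing a run for a custom flow looks roughly like the tutorial's closing step. A sketch only: `task`, `flow_id`, and `predictions` are assumed to exist, as in the tutorial code earlier in this diff.

import openml

benchmark_command = "python3 runbenchmark.py auto-sklearn medium -m aws -t 119"
my_run = openml.runs.OpenMLRun(
    task_id=task.task_id,
    flow_id=flow_id,                 # id of the custom flow created earlier
    dataset_id=task.dataset_id,
    setup_string=benchmark_command,  # the flow is a script, so record its command line
    data_content=predictions,        # the formatted prediction rows
    description_text="Run generated by the Custom Flow tutorial.",
)
my_run.publish()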