
Edit api #935

Merged (14 commits, Jul 23, 2020)
148 changes: 148 additions & 0 deletions openml/datasets/functions.py
@@ -799,6 +799,154 @@ def status_update(data_id, status):
        raise ValueError("Data id/status does not collide")


def edit_dataset(
    data_id,
    description=None,
    creator=None,
    contributor=None,
    collection_date=None,
    language=None,
    attributes=None,
    data=None,
    default_target_attribute=None,
    ignore_attribute=None,
    citation=None,
    row_id_attribute=None,
    original_data_url=None,
    paper_url=None,
):
"""
Edits an OpenMLDataset.
Specify atleast one field to edit, apart from data_id
- For certain fields, a new dataset version is created : attributes, data,
default_target_attribute, ignore_attribute, row_id_attribute.

- For other fields, the uploader can edit the exisiting version.
Noone except the uploader can edit the exisitng version.

    Parameters
    ----------
    data_id : int
        ID of the dataset.
    description : str
        Description of the dataset.
    creator : str
        The person who created the dataset.
    contributor : str
        People who contributed to the current version of the dataset.
    collection_date : str
        The date the data was originally collected, given by the uploader.
    language : str
        Language in which the data is represented.
        Starts with 1 upper case letter, rest lower case, e.g. 'English'.
    attributes : list, dict, or 'auto'
        A list of tuples. Each tuple consists of the attribute name and type.
        If passing a pandas DataFrame, the attributes can be automatically
        inferred by passing ``'auto'``. Specific attributes can be manually
        specified by passing a dictionary where the key is the name of the
        attribute and the value is the data type of the attribute.
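        For example, the list form might look like
        ``[("input1", "REAL"), ("input2", "REAL"), ("y", "REAL")]``
        (the same shape as the ``column_names`` used in the test below).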
    data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
        An array that contains both the attributes and the targets. When
        providing a dataframe, the attribute names and types can be inferred by
        passing ``attributes='auto'``.
        The target feature is indicated as meta-data of the dataset.
    default_target_attribute : str
        The default target attribute, if it exists.
        Can have multiple values, comma separated.
    ignore_attribute : str | list
        Attributes that should be excluded in modelling,
        such as identifiers and indexes.
    citation : str
        Reference(s) that should be cited when building on this data.
    row_id_attribute : str, optional
        The attribute that represents the row-id column, if present in the
        dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
        specified, the index of the dataframe will be used as the
        ``row_id_attribute``. If the name of the index is ``None``, it will
        be discarded.

        .. versionadded:: 0.8
            Inference of ``row_id_attribute`` from a dataframe.
    original_data_url : str, optional
        For derived data, the url to the original dataset.
    paper_url : str, optional
        Link to a paper describing the dataset.

    Returns
    -------
    data_id : int
        The data_id of the existing version that was edited, or of the new
        version that was created and published.
    """
    if not isinstance(data_id, int):
        raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))

    # case 1: changing any of these fields creates a new version of the dataset
    if any(
        field is not None
        for field in [
            data,
            attributes,
            default_target_attribute,
            row_id_attribute,
            ignore_attribute,
        ]
    ):
        logger.warning("Creating a new version of dataset, cannot edit existing version")
        dataset = get_dataset(data_id)

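        # note: dataset._get_arff(format="arff") decodes the dataset's ARFF file
        # into a dict; its "attributes" and "data" entries serve as fall-backs
        # below for any field that is not being edited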
        decoded_arff = dataset._get_arff(format="arff")
        data_old = decoded_arff["data"]
        data_new = data if data is not None else data_old
        dataset_new = create_dataset(
            name=dataset.name,
            description=description or dataset.description,
            creator=creator or dataset.creator,
            contributor=contributor or dataset.contributor,
            collection_date=collection_date or dataset.collection_date,
            language=language or dataset.language,
            licence=dataset.licence,
            attributes=attributes or decoded_arff["attributes"],
            data=data_new,
            default_target_attribute=default_target_attribute or dataset.default_target_attribute,
            ignore_attribute=ignore_attribute or dataset.ignore_attribute,
            citation=citation or dataset.citation,
            row_id_attribute=row_id_attribute or dataset.row_id_attribute,
            original_data_url=original_data_url or dataset.original_data_url,
            paper_url=paper_url or dataset.paper_url,
            update_comment=dataset.update_comment,
            version_label=dataset.version_label,
        )
        dataset_new.publish()
        return dataset_new.dataset_id

    # case 2: changing any of the other fields updates the existing dataset
    # compose the data edit parameters as XML
    form_data = {"data_id": data_id}
    xml = OrderedDict()
xml["oml:data_edit_parameters"] = OrderedDict()
xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml"
xml["oml:data_edit_parameters"]["oml:description"] = description
xml["oml:data_edit_parameters"]["oml:creator"] = creator
xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
xml["oml:data_edit_parameters"]["oml:language"] = language
xml["oml:data_edit_parameters"]["oml:citation"] = citation
xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url

    # drop fields that were not provided (note: since the check is on
    # falsiness rather than `is None`, empty strings are dropped as well)
    for k in list(xml["oml:data_edit_parameters"]):
        if not xml["oml:data_edit_parameters"][k]:
            del xml["oml:data_edit_parameters"][k]

    file_elements = {"edit_parameters": ("description.xml", xmltodict.unparse(xml))}
    result_xml = openml._api_calls._perform_api_call(
        "data/edit", "post", data=form_data, file_elements=file_elements
    )
    result = xmltodict.parse(result_xml)
    data_id = result["oml:data_edit"]["oml:id"]
    return int(data_id)
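
# A minimal usage sketch (illustrative only; the dataset id and field values
# are hypothetical, and editing requires owner or admin rights on the server):
#
#     import openml
#     from openml.datasets.functions import edit_dataset
#
#     openml.config.apikey = "YOUR_API_KEY"
#     # passing only non-versioning fields edits the dataset in place
#     same_id = edit_dataset(128, description="A clearer description")
#     # passing data, attributes, default_target_attribute, ignore_attribute,
#     # or row_id_attribute publishes a new version and returns its id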


def _get_dataset_description(did_cache_dir, dataset_id):
"""Get the dataset description as xml dictionary.

42 changes: 42 additions & 0 deletions tests/test_datasets/test_dataset_functions.py
@@ -21,6 +21,7 @@
from openml.utils import _tag_entity, _create_cache_directory_for_id
from openml.datasets.functions import (
    create_dataset,
    edit_dataset,
    attributes_arff_from_df,
    _get_cached_dataset,
    _get_cached_dataset_features,
@@ -1331,3 +1332,44 @@ def test_get_dataset_cache_format_feather(self):
        self.assertEqual(X.shape, (150, 5))
        self.assertEqual(len(categorical), X.shape[1])
        self.assertEqual(len(attribute_names), X.shape[1])

    def test_data_edit(self):

        # admin key for the test server (only admins or owners can edit
        # datasets; all users can edit their own datasets)
        openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"

        # case 1: in-place edit of the existing version
        did = 564
        result = edit_dataset(
            did,
description="xor dataset represents XOR operation",
contributor="",
collection_date="2019-10-29 17:06:18",
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
paper_url="",
citation="kaggle",
language="English",
)
self.assertEqual(result, did)

        # case 2: edit that creates a new version

        column_names = [
            ("input1", "REAL"),
            ("input2", "REAL"),
            ("y", "REAL"),
        ]
        desc = "xor dataset represents XOR operation"
        result = edit_dataset(
            564,
            description=desc,
            contributor="",
            collection_date="2019-10-29 17:06:18",
            attributes=column_names,
            original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
            paper_url="",
            citation="kaggle",
            language="English",
        )
        self.assertNotEqual(did, result)