diff --git a/doc/progress.rst b/doc/progress.rst
index 976c5c750..ef5ed6bae 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -8,7 +8,7 @@ Changelog
 0.11.0
 ~~~~~~
-
+* ADD #929: Add data edit API
 * FIX #873: Fixes an issue which resulted in incorrect URLs when printing OpenML objects after
   switching the server.
 * FIX #885: Logger no longer registered by default. Added utility functions to easily register
diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
index d7971d0f1..40b35bbea 100644
--- a/examples/30_extended/datasets_tutorial.py
+++ b/examples/30_extended/datasets_tutorial.py
@@ -5,12 +5,13 @@
 How to list and download datasets.
 """
-############################################################################
+""
 # License: BSD 3-Clauses
 
 import openml
 import pandas as pd
+from openml.datasets.functions import edit_dataset, get_dataset
 
 ############################################################################
 # Exercise 0
@@ -42,9 +43,9 @@
 # * Find a dataset called 'eeg_eye_state'.
 # * Find all datasets with more than 50 classes.
 datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20)
-############################################################################
+""
 datalist.query('name == "eeg-eye-state"')
-############################################################################
+""
 datalist.query("NumberOfClasses > 50")
 
 ############################################################################
@@ -108,3 +109,39 @@
     alpha=0.8,
     cmap="plasma",
 )
+
+
+############################################################################
+# Edit a created dataset
+# =================================================
+# This example uses the test server to avoid editing a dataset on the main server.
+openml.config.start_using_configuration_for_example()
+############################################################################
+# Changes to these fields edit the existing version: allowed only for the dataset owner
+data_id = edit_dataset(
+    564,
+    description="xor dataset represents XOR operation",
+    contributor="",
+    collection_date="2019-10-29 17:06:18",
+    original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
+    paper_url="",
+    citation="kaggle",
+    language="English",
+)
+edited_dataset = get_dataset(data_id)
+print(f"Edited dataset ID: {data_id}")
+
+
+############################################################################
+# Changes to these fields: attributes, default_target_attribute,
+# row_id_attribute, ignore_attribute generate a new edited version: allowed for anyone
+
+new_attributes = [
+    ("x0", "REAL"),
+    ("x1", "REAL"),
+    ("y", "REAL"),
+]
+data_id = edit_dataset(564, attributes=new_attributes)
+print(f"Edited dataset ID: {data_id}")
+
+openml.config.stop_using_configuration_for_example()
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index 79fa82867..4446f0e90 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -799,6 +799,154 @@ def status_update(data_id, status):
         raise ValueError("Data id/status does not collide")
 
 
+def edit_dataset(
+    data_id,
+    description=None,
+    creator=None,
+    contributor=None,
+    collection_date=None,
+    language=None,
+    attributes=None,
+    data=None,
+    default_target_attribute=None,
+    ignore_attribute=None,
+    citation=None,
+    row_id_attribute=None,
+    original_data_url=None,
+    paper_url=None,
+) -> int:
+    """
+    Edits an OpenMLDataset.
+
+    Specify at least one field to edit, apart from data_id.
+
+    - For certain fields, a new dataset version is created: attributes, data,
+      default_target_attribute, ignore_attribute, row_id_attribute.
+
+    - For the other fields, the uploader can edit the existing version.
+      No one except the uploader can edit the existing version.
+
+    Parameters
+    ----------
+    data_id : int
+        ID of the dataset.
+    description : str
+        Description of the dataset.
+    creator : str
+        The person who created the dataset.
+    contributor : str
+        People who contributed to the current version of the dataset.
+    collection_date : str
+        The date the data was originally collected, given by the uploader.
+    language : str
+        Language in which the data is represented.
+        Starts with 1 upper case letter, rest lower case, e.g. 'English'.
+    attributes : list, dict, or 'auto'
+        A list of tuples. Each tuple consists of the attribute name and type.
+        If passing a pandas DataFrame, the attributes can be automatically
+        inferred by passing ``'auto'``. Specific attributes can be manually
+        specified by passing a dictionary where the key is the name of the
+        attribute and the value is the data type of the attribute.
+    data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
+        An array that contains both the attributes and the targets. When
+        providing a dataframe, the attribute names and type can be inferred by
+        passing ``attributes='auto'``.
+        The target feature is indicated as meta-data of the dataset.
+    default_target_attribute : str
+        The default target attribute, if it exists.
+        Can have multiple values, comma separated.
+    ignore_attribute : str | list
+        Attributes that should be excluded in modelling,
+        such as identifiers and indexes.
+    citation : str
+        Reference(s) that should be cited when building on this data.
+    row_id_attribute : str, optional
+        The attribute that represents the row-id column, if present in the
+        dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
+        specified, the index of the dataframe will be used as the
+        ``row_id_attribute``. If the name of the index is ``None``, it will
+        be discarded.
+
+        .. versionadded:: 0.8
+            Inference of ``row_id_attribute`` from a dataframe.
+    original_data_url : str, optional
+        For derived data, the url to the original dataset.
+    paper_url : str, optional
+        Link to a paper describing the dataset.
+
+    Returns
+    -------
+    data_id : int
+        The data_id of the existing version that was edited, or of the new
+        version that was created and published.
+    """
+    if not isinstance(data_id, int):
+        raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))
+
+    # case 1: changing any of these fields creates a new version of the dataset
+    if any(
+        field is not None
+        for field in [
+            data,
+            attributes,
+            default_target_attribute,
+            row_id_attribute,
+            ignore_attribute,
+        ]
+    ):
+        logger.warning("Creating a new version of the dataset, cannot edit the existing version")
+        dataset = get_dataset(data_id)
+
+        # fields that were not provided fall back to the stored values
+        decoded_arff = dataset._get_arff(format="arff")
+        data_old = decoded_arff["data"]
+        data_new = data if data is not None else data_old
+        dataset_new = create_dataset(
+            name=dataset.name,
+            description=description or dataset.description,
+            creator=creator or dataset.creator,
+            contributor=contributor or dataset.contributor,
+            collection_date=collection_date or dataset.collection_date,
+            language=language or dataset.language,
+            licence=dataset.licence,
+            attributes=attributes or decoded_arff["attributes"],
+            data=data_new,
+            default_target_attribute=default_target_attribute or dataset.default_target_attribute,
+            ignore_attribute=ignore_attribute or dataset.ignore_attribute,
+            citation=citation or dataset.citation,
+            row_id_attribute=row_id_attribute or dataset.row_id_attribute,
+            original_data_url=original_data_url or dataset.original_data_url,
+            paper_url=paper_url or dataset.paper_url,
+            update_comment=dataset.update_comment,
+            version_label=dataset.version_label,
+        )
+        dataset_new.publish()
+        return dataset_new.dataset_id
+
+    # case 2: changing any of the remaining fields updates the existing dataset.
+    # compose the data edit parameters as xml
+    form_data = {"data_id": data_id}
+    xml = OrderedDict()  # type: 'OrderedDict[str, OrderedDict]'
+    xml["oml:data_edit_parameters"] = OrderedDict()
+    xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml"
+    xml["oml:data_edit_parameters"]["oml:description"] = description
+    xml["oml:data_edit_parameters"]["oml:creator"] = creator
+    xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
+    xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
+    xml["oml:data_edit_parameters"]["oml:language"] = language
+    xml["oml:data_edit_parameters"]["oml:citation"] = citation
+    xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
+    xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url
+
+    # drop entries that are None or empty so they are not sent to the server
+    for k in list(xml["oml:data_edit_parameters"]):
+        if not xml["oml:data_edit_parameters"][k]:
+            del xml["oml:data_edit_parameters"][k]
+
+    file_elements = {"edit_parameters": ("description.xml", xmltodict.unparse(xml))}
+    result_xml = openml._api_calls._perform_api_call(
+        "data/edit", "post", data=form_data, file_elements=file_elements
+    )
+    result = xmltodict.parse(result_xml)
+    data_id = result["oml:data_edit"]["oml:id"]
+    return int(data_id)
+
+
 def _get_dataset_description(did_cache_dir, dataset_id):
     """Get the dataset description as xml dictionary.
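Reviewer note: a minimal standalone sketch of the edit-parameters document that the case-2 branch composes and uploads as ``description.xml``. It uses only ``collections.OrderedDict`` and ``xmltodict`` (both already available to ``functions.py``); the field values are borrowed from the tutorial above and the printed output is approximate:

from collections import OrderedDict

import xmltodict

# Compose the edit parameters the same way the case-2 branch of edit_dataset does.
xml = OrderedDict()
xml["oml:data_edit_parameters"] = OrderedDict()
xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml"
xml["oml:data_edit_parameters"]["oml:description"] = "xor dataset represents XOR operation"
xml["oml:data_edit_parameters"]["oml:citation"] = "kaggle"

# This string is what gets attached as the 'description.xml' file element.
print(xmltodict.unparse(xml, pretty=True))
# Approximate output:
# <?xml version="1.0" encoding="utf-8"?>
# <oml:data_edit_parameters xmlns:oml="http://openml.org/openml">
#     <oml:description>xor dataset represents XOR operation</oml:description>
#     <oml:citation>kaggle</oml:citation>
# </oml:data_edit_parameters>

Keys left as ``None`` would be removed by the falsy-value filter before unparsing, exactly as in the function body.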
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 958d28d94..c196ea36e 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -16,11 +16,17 @@
 import openml
 from openml import OpenMLDataset
-from openml.exceptions import OpenMLCacheException, OpenMLHashException, OpenMLPrivateDatasetError
+from openml.exceptions import (
+    OpenMLCacheException,
+    OpenMLHashException,
+    OpenMLPrivateDatasetError,
+    OpenMLServerException,
+)
 from openml.testing import TestBase
 from openml.utils import _tag_entity, _create_cache_directory_for_id
 from openml.datasets.functions import (
     create_dataset,
+    edit_dataset,
     attributes_arff_from_df,
     _get_cached_dataset,
     _get_cached_dataset_features,
@@ -1331,3 +1337,76 @@ def test_get_dataset_cache_format_feather(self):
         self.assertEqual(X.shape, (150, 5))
         self.assertEqual(len(categorical), X.shape[1])
         self.assertEqual(len(attribute_names), X.shape[1])
+
+    def test_data_edit(self):
+
+        # admin key for the test server (only admins or owners can edit datasets;
+        # all users can edit their own datasets)
+        openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
+
+        # case 1: editing description, creator, contributor, collection_date,
+        # original_data_url, paper_url, citation or language edits the existing dataset.
+        did = 564
+        result = edit_dataset(
+            did,
+            description="xor dataset represents XOR operation",
+            contributor="",
+            collection_date="2019-10-29 17:06:18",
+            original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
+            paper_url="",
+            citation="kaggle",
+            language="English",
+        )
+        self.assertEqual(result, did)
+
+        # case 2: editing data, attributes, default_target_attribute, row_id_attribute
+        # or ignore_attribute generates a new dataset
+        column_names = [
+            ("input1", "REAL"),
+            ("input2", "REAL"),
+            ("y", "REAL"),
+        ]
+        desc = "xor dataset represents XOR operation"
+        result = edit_dataset(
+            564,
+            description=desc,
+            contributor="",
+            collection_date="2019-10-29 17:06:18",
+            attributes=column_names,
+            original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
+            paper_url="",
+            citation="kaggle",
+            language="English",
+        )
+        self.assertNotEqual(did, result)
+
+    def test_data_edit_errors(self):
+
+        # admin key for the test server (only admins or owners can edit datasets).
+        openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
+        # check the server exception when no field to edit is provided
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Please provide atleast one field among description, creator, contributor, "
+            "collection_date, language, citation, original_data_url or paper_url to edit.",
+            edit_dataset,
+            data_id=564,
+        )
+        # check the server exception when an unknown dataset is provided
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Unknown dataset",
+            edit_dataset,
+            data_id=100000,
+            description="xor operation dataset",
+        )
+        # check the server exception when a non-owner or non-admin tries to edit an
+        # existing dataset
+        openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f"
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Dataset is not owned by you",
+            edit_dataset,
+            data_id=564,
+            description="xor data",
+        )
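A behavioural detail worth flagging in review: in the case-1 branch, metadata fields fall back to the stored values via ``field or dataset.field``, so falsy inputs such as an empty string keep the old value rather than clearing it. A tiny standalone illustration (plain Python, no server calls; ``merge`` is a hypothetical helper that mirrors the fallback):

stored_description = "old description"


def merge(new, old):
    # Mirrors the `field or dataset.field` fallback in edit_dataset's case-1 branch.
    return new or old


print(merge(None, stored_description))        # old description  (field not edited)
print(merge("", stored_description))          # old description  ("" cannot clear a field)
print(merge("new text", stored_description))  # new text

In the case-2 branch the same pattern appears server-side instead: empty strings are stripped from the XML by the falsy-value filter before upload.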