From 12b67f1542a2f559be0593ac8e349bd873a5dff7 Mon Sep 17 00:00:00 2001 From: sahithyaravi1493 Date: Wed, 19 Aug 2020 09:59:38 +0200 Subject: [PATCH 1/4] change edit_api to reflect server --- openml/datasets/functions.py | 64 +---------------- tests/test_datasets/test_dataset_functions.py | 69 +++++++++---------- 2 files changed, 37 insertions(+), 96 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index bda02d419..0f3037a74 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -806,8 +806,6 @@ def edit_dataset( contributor=None, collection_date=None, language=None, - attributes=None, - data=None, default_target_attribute=None, ignore_attribute=None, citation=None, @@ -839,17 +837,6 @@ def edit_dataset( language : str Language in which the data is represented. Starts with 1 upper case letter, rest lower case, e.g. 'English'. - attributes : list, dict, or 'auto' - A list of tuples. Each tuple consists of the attribute name and type. - If passing a pandas DataFrame, the attributes can be automatically - inferred by passing ``'auto'``. Specific attributes can be manually - specified by a passing a dictionary where the key is the name of the - attribute and the value is the data type of the attribute. - data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features) - An array that contains both the attributes and the targets. When - providing a dataframe, the attribute names and type can be inferred by - passing ``attributes='auto'``. - The target feature is indicated as meta-data of the dataset. default_target_attribute : str The default target attribute, if it exists. Can have multiple values, comma separated. @@ -879,54 +866,6 @@ def edit_dataset( if not isinstance(data_id, int): raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) - # case 1, changing these fields creates a new version of the dataset with changed field - if any( - field is not None - for field in [ - data, - attributes, - default_target_attribute, - row_id_attribute, - ignore_attribute, - ] - ): - logger.warning("Creating a new version of dataset, cannot edit existing version") - - # Get old dataset and features - dataset = get_dataset(data_id) - df, y, categorical, attribute_names = dataset.get_data(dataset_format="dataframe") - attributes_old = attributes_arff_from_df(df) - - # Sparse data needs to be provided in a different format from dense data - if dataset.format == "sparse_arff": - df, y, categorical, attribute_names = dataset.get_data(dataset_format="array") - data_old = coo_matrix(df) - else: - data_old = df - data_new = data if data is not None else data_old - dataset_new = create_dataset( - name=dataset.name, - description=description or dataset.description, - creator=creator or dataset.creator, - contributor=contributor or dataset.contributor, - collection_date=collection_date or dataset.collection_date, - language=language or dataset.language, - licence=dataset.licence, - attributes=attributes or attributes_old, - data=data_new, - default_target_attribute=default_target_attribute or dataset.default_target_attribute, - ignore_attribute=ignore_attribute or dataset.ignore_attribute, - citation=citation or dataset.citation, - row_id_attribute=row_id_attribute or dataset.row_id_attribute, - original_data_url=original_data_url or dataset.original_data_url, - paper_url=paper_url or dataset.paper_url, - update_comment=dataset.update_comment, - version_label=dataset.version_label, - ) - dataset_new.publish() - return dataset_new.dataset_id - - # case 2, changing any of these fields will update existing dataset # compose data edit parameters as xml form_data = {"data_id": data_id} xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' @@ -937,6 +876,9 @@ def edit_dataset( xml["oml:data_edit_parameters"]["oml:contributor"] = contributor xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date xml["oml:data_edit_parameters"]["oml:language"] = language + xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute + xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute + xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute xml["oml:data_edit_parameters"]["oml:citation"] = citation xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index a3be7b2b7..957dadd7b 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1341,47 +1341,34 @@ def test_get_dataset_cache_format_feather(self): self.assertEqual(len(attribute_names), X.shape[1]) def test_data_edit(self): - - # admin key for test server (only admins or owners can edit datasets). - # all users can edit their own datasets) - openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3" - - # case 1, editing description, creator, contributor, collection_date, original_data_url, - # paper_url, citation, language edits existing dataset. + # Case 1 + # All users can edit non-critical fields of datasets + desc = "xor dataset representing XOR operation" did = 564 result = edit_dataset( did, - description="xor dataset represents XOR operation", - contributor="", + description=desc, + contributor="xxx", collection_date="2019-10-29 17:06:18", original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor", paper_url="", citation="kaggle", language="English", ) - self.assertEqual(result, did) - - # case 2, editing data, attributes, default_target_attribute, row_id_attribute, - # ignore_attribute generates a new dataset + self.assertEqual(did, result) + edited_dataset = openml.datasets.get_dataset(did) + self.assertEqual(edited_dataset.description, desc) - column_names = [ - ("input1", "REAL"), - ("input2", "REAL"), - ("y", "REAL"), - ] + # Case 2 + # only admins or owners can edit all critical fields of datasets + # admin key for test server + openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3" desc = "xor dataset represents XOR operation" - result = edit_dataset( - 564, - description=desc, - contributor="", - collection_date="2019-10-29 17:06:18", - attributes=column_names, - original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor", - paper_url="", - citation="kaggle", - language="English", - ) - self.assertNotEqual(did, result) + did = 565 + result = edit_dataset(did, default_target_attribute="y", ignore_attribute="input1") + self.assertEqual(did, result) + edited_dataset = openml.datasets.get_dataset(did) + self.assertEqual(edited_dataset.ignore_attribute, ["input1"]) def test_data_edit_errors(self): @@ -1390,8 +1377,10 @@ def test_data_edit_errors(self): # Check server exception when no field to edit is provided self.assertRaisesRegex( OpenMLServerException, - "Please provide atleast one field among description, creator, contributor, " - "collection_date, language, citation, original_data_url or paper_url to edit.", + "Please provide atleast one field among description, creator, " + "contributor, collection_date, language, citation, " + "original_data_url, default_target_attribute, row_id_attribute, " + "ignore_attribute or paper_url to edit.", edit_dataset, data_id=564, ) @@ -1403,12 +1392,22 @@ def test_data_edit_errors(self): data_id=100000, description="xor operation dataset", ) - # Check server exception when a non-owner or non-admin tries to edit existing dataset + # Check server exception when owner/admin edits critical features of dataset with tasks + self.assertRaisesRegex( + OpenMLServerException, + "Critical features default_target_attribute, row_id_attribute and ignore_attribute " + "can only be edited for datasets without any tasks.", + edit_dataset, + data_id=1, + default_target_attribute="y", + ) + # Check server exception when a non-owner or non-admin tries to edit critical features openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f" self.assertRaisesRegex( OpenMLServerException, - "Dataset is not owned by you", + "Critical features default_target_attribute, row_id_attribute and ignore_attribute " + "can be edited only by the owner. Fork the dataset if changes are required.", edit_dataset, data_id=564, - description="xor data", + default_target_attribute="y", ) From d1147b6e99da93eb352fe19376f34656b0363e1a Mon Sep 17 00:00:00 2001 From: sahithyaravi1493 Date: Thu, 27 Aug 2020 10:09:11 +0200 Subject: [PATCH 2/4] change test and example to reflect rest API changes --- examples/30_extended/datasets_tutorial.py | 36 +++++++++---------- tests/test_datasets/test_dataset_functions.py | 36 +++++++++---------- 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index 40b35bbea..ed90424ab 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -21,7 +21,7 @@ # # * Use the output_format parameter to select output type # * Default gives 'dict' (other option: 'dataframe', see below) - +# openml_list = openml.datasets.list_datasets() # returns a dict # Show a nice table with some key data properties @@ -117,15 +117,19 @@ # This example uses the test server, to avoid editing a dataset on the main server. openml.config.start_using_configuration_for_example() ############################################################################ -# Changes to these field edits existing version: allowed only for dataset owner +# Change the non-critical fields +desc = ( + "This data sets consists of 3 different types of irises' " + "(Setosa, Versicolour, and Virginica) petal and sepal length," + " stored in a 150x4 numpy.ndarray" +) +did = 128 data_id = edit_dataset( - 564, - description="xor dataset represents XOR operation", - contributor="", - collection_date="2019-10-29 17:06:18", - original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor", - paper_url="", - citation="kaggle", + did, + description=desc, + creator="R.A.Fisher", + collection_date="1937", + citation="The use of multiple measurements in taxonomic problems", language="English", ) edited_dataset = get_dataset(data_id) @@ -133,15 +137,11 @@ ############################################################################ -# Changes to these fields: attributes, default_target_attribute, -# row_id_attribute, ignore_attribute generates a new edited version: allowed for anyone - -new_attributes = [ - ("x0", "REAL"), - ("x1", "REAL"), - ("y", "REAL"), -] -data_id = edit_dataset(564, attributes=new_attributes) +# Changes to these fields: default_target_attribute, row_id_attribute, +# ignore_attribute can only be performed by owner +# To edit critical fields of a dataset owned by you, configure the API key: +# openml.config.apikey = 'FILL_IN_OPENML_API_KEY' +data_id = edit_dataset(564, default_target_attribute="y") print(f"Edited dataset ID: {data_id}") openml.config.stop_using_configuration_for_example() diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 957dadd7b..5076d06c2 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1343,16 +1343,18 @@ def test_get_dataset_cache_format_feather(self): def test_data_edit(self): # Case 1 # All users can edit non-critical fields of datasets - desc = "xor dataset representing XOR operation" - did = 564 + desc = ( + "This data sets consists of 3 different types of irises' " + "(Setosa, Versicolour, and Virginica) petal and sepal length," + " stored in a 150x4 numpy.ndarray" + ) + did = 128 result = edit_dataset( did, description=desc, - contributor="xxx", - collection_date="2019-10-29 17:06:18", - original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor", - paper_url="", - citation="kaggle", + creator="R.A.Fisher", + collection_date="1937", + citation="The use of multiple measurements in taxonomic problems", language="English", ) self.assertEqual(did, result) @@ -1360,20 +1362,15 @@ def test_data_edit(self): self.assertEqual(edited_dataset.description, desc) # Case 2 - # only admins or owners can edit all critical fields of datasets - # admin key for test server - openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3" - desc = "xor dataset represents XOR operation" - did = 565 - result = edit_dataset(did, default_target_attribute="y", ignore_attribute="input1") + # only owners (or admin) can edit all critical fields of datasets + # this is a dataset created by CI, so it is editable by this test + did = 315 + result = edit_dataset(did, default_target_attribute="col_1", ignore_attribute="col_2") self.assertEqual(did, result) edited_dataset = openml.datasets.get_dataset(did) - self.assertEqual(edited_dataset.ignore_attribute, ["input1"]) + self.assertEqual(edited_dataset.ignore_attribute, ["col_2"]) def test_data_edit_errors(self): - - # admin key for test server (only admins or owners can edit datasets). - openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3" # Check server exception when no field to edit is provided self.assertRaisesRegex( OpenMLServerException, @@ -1398,16 +1395,15 @@ def test_data_edit_errors(self): "Critical features default_target_attribute, row_id_attribute and ignore_attribute " "can only be edited for datasets without any tasks.", edit_dataset, - data_id=1, + data_id=223, default_target_attribute="y", ) # Check server exception when a non-owner or non-admin tries to edit critical features - openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f" self.assertRaisesRegex( OpenMLServerException, "Critical features default_target_attribute, row_id_attribute and ignore_attribute " "can be edited only by the owner. Fork the dataset if changes are required.", edit_dataset, - data_id=564, + data_id=128, default_target_attribute="y", ) From 17fb46bbb77f2e56f821eb06d309004fc89072ed Mon Sep 17 00:00:00 2001 From: sahithyaravi1493 Date: Fri, 28 Aug 2020 12:03:18 +0200 Subject: [PATCH 3/4] tutorial comments --- examples/30_extended/datasets_tutorial.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index ed90424ab..e4c7e6888 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -117,7 +117,9 @@ # This example uses the test server, to avoid editing a dataset on the main server. openml.config.start_using_configuration_for_example() ############################################################################ -# Change the non-critical fields +# Edit non-critical fields, allowed for all authorized users: +# description, creator, contributor, collection_date, language, citation, +# row_id_attribute, original_data_url,paper_url desc = ( "This data sets consists of 3 different types of irises' " "(Setosa, Versicolour, and Virginica) petal and sepal length," @@ -137,8 +139,8 @@ ############################################################################ -# Changes to these fields: default_target_attribute, row_id_attribute, -# ignore_attribute can only be performed by owner +# Edit critical fields, allowed only for owners of the dataset: +# default_target_attribute, row_id_attribute, ignore_attribute # To edit critical fields of a dataset owned by you, configure the API key: # openml.config.apikey = 'FILL_IN_OPENML_API_KEY' data_id = edit_dataset(564, default_target_attribute="y") From cfa25133af41c256916136d9de950aa4ff283035 Mon Sep 17 00:00:00 2001 From: Sahithya Ravi <44670788+sahithyaravi1493@users.noreply.github.com> Date: Fri, 28 Aug 2020 12:06:54 +0200 Subject: [PATCH 4/4] Update datasets_tutorial.py --- examples/30_extended/datasets_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index e4c7e6888..e129b7718 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -119,7 +119,7 @@ ############################################################################ # Edit non-critical fields, allowed for all authorized users: # description, creator, contributor, collection_date, language, citation, -# row_id_attribute, original_data_url,paper_url +# original_data_url, paper_url desc = ( "This data sets consists of 3 different types of irises' " "(Setosa, Versicolour, and Virginica) petal and sepal length,"