Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

change edit_api to reflect server #941

Merged
merged 4 commits into from
Aug 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 20 additions & 18 deletions examples/30_extended/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#
# * Use the output_format parameter to select output type
# * Default gives 'dict' (other option: 'dataframe', see below)

#
openml_list = openml.datasets.list_datasets() # returns a dict

# Show a nice table with some key data properties
Expand Down Expand Up @@ -117,31 +117,33 @@
# This example uses the test server, to avoid editing a dataset on the main server.
openml.config.start_using_configuration_for_example()
############################################################################
# Changes to these fields edit the existing version: allowed only for the dataset owner
# Edit non-critical fields, allowed for all authorized users:
# description, creator, contributor, collection_date, language, citation,
# original_data_url, paper_url
desc = (
"This data sets consists of 3 different types of irises' "
"(Setosa, Versicolour, and Virginica) petal and sepal length,"
" stored in a 150x4 numpy.ndarray"
)
did = 128
data_id = edit_dataset(
564,
description="xor dataset represents XOR operation",
contributor="",
collection_date="2019-10-29 17:06:18",
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
paper_url="",
citation="kaggle",
did,
description=desc,
creator="R.A.Fisher",
collection_date="1937",
citation="The use of multiple measurements in taxonomic problems",
language="English",
)
edited_dataset = get_dataset(data_id)
print(f"Edited dataset ID: {data_id}")


############################################################################
# Changes to these fields: attributes, default_target_attribute,
# row_id_attribute, ignore_attribute generate a new edited version: allowed for anyone

new_attributes = [
("x0", "REAL"),
("x1", "REAL"),
("y", "REAL"),
]
data_id = edit_dataset(564, attributes=new_attributes)
# Edit critical fields, allowed only for owners of the dataset:
# default_target_attribute, row_id_attribute, ignore_attribute
# To edit critical fields of a dataset owned by you, configure the API key:
# openml.config.apikey = 'FILL_IN_OPENML_API_KEY'
data_id = edit_dataset(564, default_target_attribute="y")
print(f"Edited dataset ID: {data_id}")

openml.config.stop_using_configuration_for_example()
64 changes: 3 additions & 61 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,8 +806,6 @@ def edit_dataset(
contributor=None,
collection_date=None,
language=None,
attributes=None,
data=None,
default_target_attribute=None,
ignore_attribute=None,
citation=None,
Expand Down Expand Up @@ -839,17 +837,6 @@ def edit_dataset(
language : str
Language in which the data is represented.
Starts with 1 upper case letter, rest lower case, e.g. 'English'.
attributes : list, dict, or 'auto'
A list of tuples. Each tuple consists of the attribute name and type.
If passing a pandas DataFrame, the attributes can be automatically
inferred by passing ``'auto'``. Specific attributes can be manually
specified by a passing a dictionary where the key is the name of the
attribute and the value is the data type of the attribute.
data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
An array that contains both the attributes and the targets. When
providing a dataframe, the attribute names and type can be inferred by
passing ``attributes='auto'``.
The target feature is indicated as meta-data of the dataset.
default_target_attribute : str
The default target attribute, if it exists.
Can have multiple values, comma separated.
Expand Down Expand Up @@ -879,54 +866,6 @@ def edit_dataset(
if not isinstance(data_id, int):
raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))

# case 1, changing these fields creates a new version of the dataset with changed field
if any(
field is not None
for field in [
data,
attributes,
default_target_attribute,
row_id_attribute,
ignore_attribute,
]
):
logger.warning("Creating a new version of dataset, cannot edit existing version")

# Get old dataset and features
dataset = get_dataset(data_id)
df, y, categorical, attribute_names = dataset.get_data(dataset_format="dataframe")
attributes_old = attributes_arff_from_df(df)

# Sparse data needs to be provided in a different format from dense data
if dataset.format == "sparse_arff":
df, y, categorical, attribute_names = dataset.get_data(dataset_format="array")
data_old = coo_matrix(df)
else:
data_old = df
data_new = data if data is not None else data_old
dataset_new = create_dataset(
name=dataset.name,
description=description or dataset.description,
creator=creator or dataset.creator,
contributor=contributor or dataset.contributor,
collection_date=collection_date or dataset.collection_date,
language=language or dataset.language,
licence=dataset.licence,
attributes=attributes or attributes_old,
data=data_new,
default_target_attribute=default_target_attribute or dataset.default_target_attribute,
ignore_attribute=ignore_attribute or dataset.ignore_attribute,
citation=citation or dataset.citation,
row_id_attribute=row_id_attribute or dataset.row_id_attribute,
original_data_url=original_data_url or dataset.original_data_url,
paper_url=paper_url or dataset.paper_url,
update_comment=dataset.update_comment,
version_label=dataset.version_label,
)
dataset_new.publish()
return dataset_new.dataset_id

# case 2, changing any of these fields will update existing dataset
# compose data edit parameters as xml
form_data = {"data_id": data_id}
xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]'
Expand All @@ -937,6 +876,9 @@ def edit_dataset(
xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
xml["oml:data_edit_parameters"]["oml:language"] = language
xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute
xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute
xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute
xml["oml:data_edit_parameters"]["oml:citation"] = citation
xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url
Expand Down
87 changes: 41 additions & 46 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1341,57 +1341,43 @@ def test_get_dataset_cache_format_feather(self):
self.assertEqual(len(attribute_names), X.shape[1])

def test_data_edit(self):

# admin key for test server (only admins or owners can edit datasets;
# all users can edit their own datasets)
openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"

# case 1, editing description, creator, contributor, collection_date, original_data_url,
# paper_url, citation, language edits existing dataset.
did = 564
result = edit_dataset(
did,
description="xor dataset represents XOR operation",
contributor="",
collection_date="2019-10-29 17:06:18",
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
paper_url="",
citation="kaggle",
language="English",
# Case 1
# All users can edit non-critical fields of datasets
desc = (
"This data sets consists of 3 different types of irises' "
"(Setosa, Versicolour, and Virginica) petal and sepal length,"
" stored in a 150x4 numpy.ndarray"
)
self.assertEqual(result, did)

# case 2, editing data, attributes, default_target_attribute, row_id_attribute,
# ignore_attribute generates a new dataset

column_names = [
("input1", "REAL"),
("input2", "REAL"),
("y", "REAL"),
]
desc = "xor dataset represents XOR operation"
did = 128
result = edit_dataset(
564,
did,
description=desc,
contributor="",
collection_date="2019-10-29 17:06:18",
attributes=column_names,
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
paper_url="",
citation="kaggle",
creator="R.A.Fisher",
collection_date="1937",
citation="The use of multiple measurements in taxonomic problems",
language="English",
)
self.assertNotEqual(did, result)
self.assertEqual(did, result)
edited_dataset = openml.datasets.get_dataset(did)
self.assertEqual(edited_dataset.description, desc)

# Case 2
# only owners (or admin) can edit all critical fields of datasets
# this is a dataset created by CI, so it is editable by this test
did = 315
result = edit_dataset(did, default_target_attribute="col_1", ignore_attribute="col_2")
self.assertEqual(did, result)
edited_dataset = openml.datasets.get_dataset(did)
self.assertEqual(edited_dataset.ignore_attribute, ["col_2"])

def test_data_edit_errors(self):

# admin key for test server (only admins or owners can edit datasets).
openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
# Check server exception when no field to edit is provided
self.assertRaisesRegex(
OpenMLServerException,
"Please provide atleast one field among description, creator, contributor, "
"collection_date, language, citation, original_data_url or paper_url to edit.",
"Please provide atleast one field among description, creator, "
"contributor, collection_date, language, citation, "
"original_data_url, default_target_attribute, row_id_attribute, "
"ignore_attribute or paper_url to edit.",
edit_dataset,
data_id=564,
)
Expand All @@ -1403,12 +1389,21 @@ def test_data_edit_errors(self):
data_id=100000,
description="xor operation dataset",
)
# Check server exception when a non-owner or non-admin tries to edit existing dataset
openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f"
# Check server exception when owner/admin edits critical features of dataset with tasks
self.assertRaisesRegex(
OpenMLServerException,
"Dataset is not owned by you",
"Critical features default_target_attribute, row_id_attribute and ignore_attribute "
"can only be edited for datasets without any tasks.",
edit_dataset,
data_id=564,
description="xor data",
data_id=223,
default_target_attribute="y",
)
# Check server exception when a non-owner or non-admin tries to edit critical features
self.assertRaisesRegex(
OpenMLServerException,
"Critical features default_target_attribute, row_id_attribute and ignore_attribute "
"can be edited only by the owner. Fork the dataset if changes are required.",
edit_dataset,
data_id=128,
default_target_attribute="y",
)