
Edit api #935

Merged (14 commits, Jul 23, 2020)
148 changes: 148 additions & 0 deletions openml/datasets/functions.py
@@ -799,6 +799,154 @@ def status_update(data_id, status):
        raise ValueError("Data id/status does not collide")


def edit_dataset(
    data_id,
    description=None,
    creator=None,
    contributor=None,
    collection_date=None,
    language=None,
    attributes=None,
    data=None,
    default_target_attribute=None,
    ignore_attribute=None,
    citation=None,
    row_id_attribute=None,
    original_data_url=None,
    paper_url=None,
):
"""
Edits an OpenMLDataset.
Specify atleast one field to edit, apart from data_id
- For certain fields, a new dataset version is created : attributes, data,
default_target_attribute, ignore_attribute, row_id_attribute.

- For other fields, the uploader can edit the exisiting version.
Noone except the uploader can edit the exisitng version.

    Parameters
    ----------
    data_id : int
        ID of the dataset.
    description : str
        Description of the dataset.
    creator : str
        The person who created the dataset.
    contributor : str
        People who contributed to the current version of the dataset.
    collection_date : str
        The date the data was originally collected, given by the uploader.
    language : str
        Language in which the data is represented.
        Starts with 1 upper case letter, rest lower case, e.g. 'English'.
    attributes : list, dict, or 'auto'
        A list of tuples. Each tuple consists of the attribute name and type.
        If passing a pandas DataFrame, the attributes can be automatically
        inferred by passing ``'auto'``. Specific attributes can be manually
        specified by passing a dictionary where the key is the name of the
        attribute and the value is the data type of the attribute.
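        For example, the list form might look like
        ``[("input1", "REAL"), ("input2", "REAL"), ("y", "REAL")]``
        (the same shape as the ``column_names`` used in the test below).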
    data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
        An array that contains both the attributes and the targets. When
        providing a dataframe, the attribute names and types can be inferred by
        passing ``attributes='auto'``.
        The target feature is indicated as meta-data of the dataset.
    default_target_attribute : str
        The default target attribute, if it exists.
        Can have multiple values, comma separated.
    ignore_attribute : str | list
        Attributes that should be excluded in modelling,
        such as identifiers and indexes.
    citation : str
        Reference(s) that should be cited when building on this data.
    row_id_attribute : str, optional
        The attribute that represents the row-id column, if present in the
        dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
        specified, the index of the dataframe will be used as the
        ``row_id_attribute``. If the name of the index is ``None``, it will
        be discarded.

        .. versionadded:: 0.8
            Inference of ``row_id_attribute`` from a dataframe.
    original_data_url : str, optional
        For derived data, the url to the original dataset.
    paper_url : str, optional
        Link to a paper describing the dataset.

    Returns
    -------
    data_id : int
        The data_id of the existing version that was edited, or of the new
        version that was created and published.
    """
    if not isinstance(data_id, int):
        raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))

    # case 1: changing any of these fields creates a new version of the dataset
    if any(
        field is not None
        for field in [
            data,
            attributes,
            default_target_attribute,
            row_id_attribute,
            ignore_attribute,
        ]
    ):
        logger.warning("Creating a new version of dataset, cannot edit existing version")
        dataset = get_dataset(data_id)

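        # note: dataset._get_arff(format="arff") decodes the dataset's ARFF file
        # into a dict; its "attributes" and "data" entries serve as fall-backs
        # below for any field that is not being edited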
        decoded_arff = dataset._get_arff(format="arff")
        data_old = decoded_arff["data"]
        data_new = data if data is not None else data_old
        dataset_new = create_dataset(
            name=dataset.name,
            description=description or dataset.description,
            creator=creator or dataset.creator,
            contributor=contributor or dataset.contributor,
            collection_date=collection_date or dataset.collection_date,
            language=language or dataset.language,
            licence=dataset.licence,
            attributes=attributes or decoded_arff["attributes"],
            data=data_new,
            default_target_attribute=default_target_attribute or dataset.default_target_attribute,
            ignore_attribute=ignore_attribute or dataset.ignore_attribute,
            citation=citation or dataset.citation,
            row_id_attribute=row_id_attribute or dataset.row_id_attribute,
            original_data_url=original_data_url or dataset.original_data_url,
            paper_url=paper_url or dataset.paper_url,
            update_comment=dataset.update_comment,
            version_label=dataset.version_label,
        )
        dataset_new.publish()
        return dataset_new.dataset_id

    # case 2: changing any of the other fields updates the existing dataset
    # compose the data edit parameters as XML
    form_data = {"data_id": data_id}
    xml = OrderedDict()
xml["oml:data_edit_parameters"] = OrderedDict()
xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml"
xml["oml:data_edit_parameters"]["oml:description"] = description
xml["oml:data_edit_parameters"]["oml:creator"] = creator
xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
xml["oml:data_edit_parameters"]["oml:language"] = language
xml["oml:data_edit_parameters"]["oml:citation"] = citation
xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url

    # drop fields that were not provided (note: since the check is on
    # falsiness rather than `is None`, empty strings are dropped as well)
    for k in list(xml["oml:data_edit_parameters"]):
        if not xml["oml:data_edit_parameters"][k]:
            del xml["oml:data_edit_parameters"][k]

    file_elements = {"edit_parameters": ("description.xml", xmltodict.unparse(xml))}
    result_xml = openml._api_calls._perform_api_call(
        "data/edit", "post", data=form_data, file_elements=file_elements
    )
    result = xmltodict.parse(result_xml)
    data_id = result["oml:data_edit"]["oml:id"]
    return int(data_id)
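
# A minimal usage sketch (illustrative only; the dataset id and field values
# are hypothetical, and editing requires owner or admin rights on the server):
#
#     import openml
#     from openml.datasets.functions import edit_dataset
#
#     openml.config.apikey = "YOUR_API_KEY"
#     # passing only non-versioning fields edits the dataset in place
#     same_id = edit_dataset(128, description="A clearer description")
#     # passing data, attributes, default_target_attribute, ignore_attribute,
#     # or row_id_attribute publishes a new version and returns its id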


def _get_dataset_description(did_cache_dir, dataset_id):
"""Get the dataset description as xml dictionary.

42 changes: 42 additions & 0 deletions tests/test_datasets/test_dataset_functions.py
@@ -21,6 +21,7 @@
from openml.utils import _tag_entity, _create_cache_directory_for_id
from openml.datasets.functions import (
    create_dataset,
    edit_dataset,
    attributes_arff_from_df,
    _get_cached_dataset,
    _get_cached_dataset_features,
@@ -1331,3 +1332,44 @@ def test_get_dataset_cache_format_feather(self):
        self.assertEqual(X.shape, (150, 5))
        self.assertEqual(len(categorical), X.shape[1])
        self.assertEqual(len(attribute_names), X.shape[1])

    def test_data_edit(self):

        # admin key for the test server (only admins or owners can edit
        # datasets; all users can edit their own datasets)
        openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"

        # case 1: in-place edit of the existing version
        did = 564
        result = edit_dataset(
            did,
description="xor dataset represents XOR operation",
contributor="",
collection_date="2019-10-29 17:06:18",
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
paper_url="",
citation="kaggle",
language="English",
)
self.assertEqual(result, did)

        # case 2: edit that creates a new version

        column_names = [
            ("input1", "REAL"),
            ("input2", "REAL"),
            ("y", "REAL"),
        ]
        desc = "xor dataset represents XOR operation"
        result = edit_dataset(
            564,
            description=desc,
            contributor="",
            collection_date="2019-10-29 17:06:18",
            attributes=column_names,
            original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
            paper_url="",
            citation="kaggle",
            language="English",
        )
        self.assertNotEqual(did, result)