
Edit api #935

Merged 14 commits on Jul 23, 2020
2 changes: 1 addition & 1 deletion doc/progress.rst
@@ -8,7 +8,7 @@ Changelog

0.11.0
~~~~~~

* ADD #929: Add data edit API
* FIX #873: Fixes an issue which resulted in incorrect URLs when printing OpenML objects after
switching the server.
* FIX #885: Logger no longer registered by default. Added utility functions to easily register
43 changes: 40 additions & 3 deletions examples/30_extended/datasets_tutorial.py
@@ -5,12 +5,13 @@

How to list and download datasets.
"""
############################################################################
""

# License: BSD 3-Clause

import openml
import pandas as pd
from openml.datasets.functions import edit_dataset, get_dataset

############################################################################
# Exercise 0
@@ -42,9 +43,9 @@
# * Find a dataset called 'eeg_eye_state'.
# * Find all datasets with more than 50 classes.
datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20)
############################################################################
""
datalist.query('name == "eeg-eye-state"')
############################################################################
""
datalist.query("NumberOfClasses > 50")

############################################################################
@@ -108,3 +109,39 @@
alpha=0.8,
cmap="plasma",
)


############################################################################
# Edit a created dataset
# =================================================
# This example uses the test server to avoid editing a dataset on the main server.
openml.config.start_using_configuration_for_example()
############################################################################
# Changes to these fields edit the existing version: allowed only for the dataset owner
data_id = edit_dataset(
564,
description="xor dataset represents XOR operation",
contributor="",
collection_date="2019-10-29 17:06:18",
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
paper_url="",
citation="kaggle",
language="English",
)
edited_dataset = get_dataset(data_id)
print(f"Edited dataset ID: {data_id}")


############################################################################
# Changes to these fields (attributes, default_target_attribute,
# row_id_attribute, ignore_attribute) generate a new edited version: allowed for anyone

new_attributes = [
("x0", "REAL"),
("x1", "REAL"),
("y", "REAL"),
]
data_id = edit_dataset(564, attributes=new_attributes)
print(f"Edited dataset ID: {data_id}")

openml.config.stop_using_configuration_for_example()
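
A minimal sketch of verifying the in-place edit above, assuming the test server still hosts dataset 564 and the configured API key is allowed to edit it:

import openml
from openml.datasets.functions import edit_dataset, get_dataset

openml.config.start_using_configuration_for_example()

# Metadata-only edits keep the same dataset id; re-fetching shows the change.
data_id = edit_dataset(564, description="xor dataset represents XOR operation")
edited = get_dataset(data_id)
print(edited.dataset_id, edited.description)

openml.config.stop_using_configuration_for_example()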
148 changes: 148 additions & 0 deletions openml/datasets/functions.py
@@ -799,6 +799,154 @@ def status_update(data_id, status):
raise ValueError("Data id/status does not collide")


def edit_dataset(
data_id,
description=None,
creator=None,
contributor=None,
collection_date=None,
language=None,
attributes=None,
data=None,
default_target_attribute=None,
ignore_attribute=None,
citation=None,
row_id_attribute=None,
original_data_url=None,
paper_url=None,
) -> int:
"""
Edits an OpenMLDataset.
Specify at least one field to edit, apart from data_id.
- For certain fields, a new dataset version is created: attributes, data,
default_target_attribute, ignore_attribute, row_id_attribute.

- For all other fields, the uploader can edit the existing version.
No one except the uploader can edit the existing version.

Parameters
----------
data_id : int
ID of the dataset.
description : str
Description of the dataset.
creator : str
The person who created the dataset.
contributor : str
People who contributed to the current version of the dataset.
collection_date : str
The date the data was originally collected, given by the uploader.
language : str
Language in which the data is represented.
Starts with 1 upper case letter, rest lower case, e.g. 'English'.
attributes : list, dict, or 'auto'
A list of tuples. Each tuple consists of the attribute name and type.
If passing a pandas DataFrame, the attributes can be automatically
inferred by passing ``'auto'``. Specific attributes can be manually
specified by passing a dictionary where the key is the name of the
attribute and the value is the data type of the attribute.
data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
An array that contains both the attributes and the targets. When
providing a dataframe, the attribute names and type can be inferred by
passing ``attributes='auto'``.
The target feature is indicated as meta-data of the dataset.
default_target_attribute : str
The default target attribute, if it exists.
Can have multiple values, comma separated.
ignore_attribute : str | list
Attributes that should be excluded in modelling,
such as identifiers and indexes.
citation : str
Reference(s) that should be cited when building on this data.
row_id_attribute : str, optional
The attribute that represents the row-id column, if present in the
dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
specified, the index of the dataframe will be used as the
``row_id_attribute``. If the name of the index is ``None``, it will
be discarded.

.. versionadded:: 0.8
Inference of ``row_id_attribute`` from a dataframe.
original_data_url : str, optional
For derived data, the url to the original dataset.
paper_url : str, optional
Link to a paper describing the dataset.


Returns
-------
data_id : int
The id of the dataset: the existing version if edited in place,
or the id of the newly created and published version.
"""
if not isinstance(data_id, int):
raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))

# case 1: changing any of these fields creates a new version of the dataset
if any(
field is not None
for field in [
data,
attributes,
default_target_attribute,
row_id_attribute,
ignore_attribute,
]
):
logger.warning("Creating a new version of dataset, cannot edit existing version")
dataset = get_dataset(data_id)

decoded_arff = dataset._get_arff(format="arff")
data_old = decoded_arff["data"]
data_new = data if data is not None else data_old
dataset_new = create_dataset(
name=dataset.name,
description=description or dataset.description,
creator=creator or dataset.creator,
contributor=contributor or dataset.contributor,
collection_date=collection_date or dataset.collection_date,
language=language or dataset.language,
licence=dataset.licence,
attributes=attributes or decoded_arff["attributes"],
data=data_new,
default_target_attribute=default_target_attribute or dataset.default_target_attribute,
ignore_attribute=ignore_attribute or dataset.ignore_attribute,
citation=citation or dataset.citation,
row_id_attribute=row_id_attribute or dataset.row_id_attribute,
original_data_url=original_data_url or dataset.original_data_url,
paper_url=paper_url or dataset.paper_url,
update_comment=dataset.update_comment,
version_label=dataset.version_label,
)
dataset_new.publish()
return dataset_new.dataset_id

# case 2: changing any of the remaining fields updates the existing dataset
# compose the data edit parameters as XML
form_data = {"data_id": data_id}
xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]'
xml["oml:data_edit_parameters"] = OrderedDict()
xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml"
xml["oml:data_edit_parameters"]["oml:description"] = description
xml["oml:data_edit_parameters"]["oml:creator"] = creator
xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
xml["oml:data_edit_parameters"]["oml:language"] = language
xml["oml:data_edit_parameters"]["oml:citation"] = citation
xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url

# delete fields that were not provided (None or empty)
for k in list(xml["oml:data_edit_parameters"]):
if not xml["oml:data_edit_parameters"][k]:
del xml["oml:data_edit_parameters"][k]

file_elements = {"edit_parameters": ("description.xml", xmltodict.unparse(xml))}
result_xml = openml._api_calls._perform_api_call(
"data/edit", "post", data=form_data, file_elements=file_elements
)
result = xmltodict.parse(result_xml)
data_id = result["oml:data_edit"]["oml:id"]
return int(data_id)


def _get_dataset_description(did_cache_dir, dataset_id):
"""Get the dataset description as xml dictionary.

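A minimal usage sketch of the two code paths above, assuming a test-server connection and that dataset 564 exists there (as in the tutorial and tests); the xor dataframe at the end is a hypothetical stand-in:

import openml
import pandas as pd
from openml.datasets.functions import edit_dataset

openml.config.start_using_configuration_for_example()

# Case 2 fields: the existing version is edited in place and the same id
# is returned (only the uploader or an admin may do this).
same_id = edit_dataset(564, description="updated description")

# Case 1 fields: passing attributes (or data, default_target_attribute,
# row_id_attribute, ignore_attribute) publishes a new version with a new id.
new_id = edit_dataset(564, attributes=[("x0", "REAL"), ("x1", "REAL"), ("y", "REAL")])
assert new_id != same_id

# As described in the docstring, passing a dataframe with attributes="auto"
# lets the attribute names and types be inferred from the frame itself.
xor_df = pd.DataFrame(
    {"x0": [0.0, 0.0, 1.0, 1.0], "x1": [0.0, 1.0, 0.0, 1.0], "y": [0.0, 1.0, 1.0, 0.0]}
)
newer_id = edit_dataset(new_id, data=xor_df, attributes="auto")

openml.config.stop_using_configuration_for_example()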
81 changes: 80 additions & 1 deletion tests/test_datasets/test_dataset_functions.py
@@ -16,11 +16,17 @@

import openml
from openml import OpenMLDataset
from openml.exceptions import OpenMLCacheException, OpenMLHashException, OpenMLPrivateDatasetError
from openml.exceptions import (
OpenMLCacheException,
OpenMLHashException,
OpenMLPrivateDatasetError,
OpenMLServerException,
)
from openml.testing import TestBase
from openml.utils import _tag_entity, _create_cache_directory_for_id
from openml.datasets.functions import (
create_dataset,
edit_dataset,
attributes_arff_from_df,
_get_cached_dataset,
_get_cached_dataset_features,
@@ -1331,3 +1337,76 @@ def test_get_dataset_cache_format_feather(self):
self.assertEqual(X.shape, (150, 5))
self.assertEqual(len(categorical), X.shape[1])
self.assertEqual(len(attribute_names), X.shape[1])

def test_data_edit(self):

# admin key for the test server (only admins or owners can edit datasets;
# all users can edit their own datasets).
openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"

# case 1: editing description, creator, contributor, collection_date, original_data_url,
# paper_url, citation, or language edits the existing dataset.
did = 564
result = edit_dataset(
did,
description="xor dataset represents XOR operation",
contributor="",
collection_date="2019-10-29 17:06:18",
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
paper_url="",
citation="kaggle",
language="English",
)
self.assertEqual(result, did)

# case 2: editing data, attributes, default_target_attribute, row_id_attribute,
# or ignore_attribute generates a new dataset

column_names = [
("input1", "REAL"),
("input2", "REAL"),
("y", "REAL"),
]
desc = "xor dataset represents XOR operation"
result = edit_dataset(
564,
description=desc,
contributor="",
collection_date="2019-10-29 17:06:18",
attributes=column_names,
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
paper_url="",
citation="kaggle",
language="English",
)
self.assertNotEqual(did, result)

def test_data_edit_errors(self):

# admin key for the test server (only admins or owners can edit datasets).
openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
# Check server exception when no field to edit is provided
self.assertRaisesRegex(
OpenMLServerException,
"Please provide atleast one field among description, creator, contributor, "
"collection_date, language, citation, original_data_url or paper_url to edit.",
edit_dataset,
data_id=564,
)
# Check server exception when unknown dataset is provided
self.assertRaisesRegex(
OpenMLServerException,
"Unknown dataset",
edit_dataset,
data_id=100000,
description="xor operation dataset",
)
# Check server exception when a non-owner or non-admin tries to edit an existing dataset
openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f"
self.assertRaisesRegex(
OpenMLServerException,
"Dataset is not owned by you",
edit_dataset,
data_id=564,
description="xor data",
)
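
A sketch of handling the server-side failures exercised above from client code; the exact messages are server-defined and may change:

import openml
from openml.datasets.functions import edit_dataset
from openml.exceptions import OpenMLServerException

openml.config.start_using_configuration_for_example()

try:
    # No field besides data_id is given, so the server rejects the edit.
    edit_dataset(data_id=564)
except OpenMLServerException as e:
    print(f"Server rejected the edit: {e}")

try:
    # Fails when the configured key belongs to neither the owner nor an admin.
    edit_dataset(data_id=564, description="xor data")
except OpenMLServerException as e:
    print(f"Server rejected the edit: {e}")

openml.config.stop_using_configuration_for_example()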