From efff04ddf85fc18303e819186eb68ac198ab48d8 Mon Sep 17 00:00:00 2001 From: Alexander Nikitin <1243786+AlexanderVNikitin@users.noreply.github.com> Date: Tue, 4 Jun 2024 22:28:49 +0300 Subject: [PATCH] generalize dataloader --- tests/test_downstream_models.py | 2 +- tests/test_utils.py | 26 +++++++-------- tsgm/utils/datasets.py | 56 ++++++++++----------------------- 3 files changed, 30 insertions(+), 54 deletions(-) diff --git a/tests/test_downstream_models.py b/tests/test_downstream_models.py index 7556baa..d34e418 100644 --- a/tests/test_downstream_models.py +++ b/tests/test_downstream_models.py @@ -7,7 +7,7 @@ def _get_gunpoint_dataset(): - data_manager = tsgm.utils.UCRDataManager(ds="gunpoint") + data_manager = tsgm.utils.UCRDataManager(ds="GunPoint") X_train, y_train, X_test, y_test = data_manager.get() X_train, X_test = X_train[:, :, None], X_test[:, :, None] y_train = keras.utils.to_categorical(y_train - 1) diff --git a/tests/test_utils.py b/tests/test_utils.py index bdc0562..6c848f6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -65,7 +65,7 @@ def test_switch_generator(): def test_ucr_manager(): - DATASET = "gunpoint" + DATASET = "GunPoint" ucr_data_manager = tsgm.utils.UCRDataManager(ds=DATASET) assert ucr_data_manager.summary() is None X_train, y_train, X_test, y_test = ucr_data_manager.get() @@ -189,7 +189,7 @@ def test_mmd_3_test(): Y = np.random.normal(10, 100, 100)[:, None] Z = np.random.normal(0, 1, 100)[:, None] - # Use custome kernels with this (TF-sklearn compatibility) + # Use custom kernels with this (TF-sklearn compatibility) # sigma_XY = tsgm.utils.kernel_median_heuristic(X, Y); # sigma_XZ = tsgm.utils.kernel_median_heuristic(X, Z); # sigma = (sigma_XY + sigma_XZ) / 2 @@ -201,16 +201,14 @@ def test_mmd_3_test(): @pytest.mark.parametrize("dataset_name", [ - "beef", - "coffee", - "ecg200", - "electric", - "freezer", - "gunpoint", - "insect", - "mixed_shapes", - "starlight", - "wafer" + "Beef", + "Coffee", + "ECG200", + "ElectricDevices", + "GunPoint", + "MixedShapesRegularTrain", + "StarLightCurves", + "Wafer" ]) def test_ucr_loadable(dataset_name): ucr_data_manager = tsgm.utils.UCRDataManager(ds=dataset_name) @@ -222,11 +220,11 @@ def test_ucr_loadable(dataset_name): def test_ucr_raises(): with pytest.raises(ValueError) as excinfo: ucr_data_manager = tsgm.utils.UCRDataManager(ds="does not exist") - assert "ds should be in" in str(excinfo.value) + assert "ds should be listed at UCR website" in str(excinfo.value) def test_get_wafer(): - dataset = "wafer" + dataset = "Wafer" ucr_data_manager = tsgm.utils.UCRDataManager(ds=dataset) assert ucr_data_manager.summary() is None X_train, y_train, X_test, y_test = ucr_data_manager.get() diff --git a/tsgm/utils/datasets.py b/tsgm/utils/datasets.py index 37ef247..0f47639 100644 --- a/tsgm/utils/datasets.py +++ b/tsgm/utils/datasets.py @@ -130,6 +130,16 @@ def gen_sine_vs_const_dataset(N: int, T: int, D: int, max_value: int = 10, const class UCRDataManager: """ A manager for UCR collection of time series datasets. + If you find these datasets useful, please cite: + @misc{UCRArchive2018, + title = {The UCR Time Series Classification Archive}, + author = {Dau, Hoang Anh and Keogh, Eamonn and Kamgar, Kaveh and Yeh, Chin-Chia Michael and Zhu, Yan + and Gharghabi, Shaghayegh and Ratanamahatana, Chotirat Ann and Yanping and Hu, Bing + and Begum, Nurjahan and Bagnall, Anthony and Mueen, Abdullah and Batista, Gustavo, and Hexagon-ML}, + year = {2018}, + month = {October}, + note = {\\url{https://www.cs.ucr.edu/~eamonn/time_series_data_2018/}} + } """ mirrors = ["https://www.cs.ucr.edu/~eamonn/time_series_data_2018/"] resources = [("UCRArchive_2018.zip", 0)] @@ -140,7 +150,7 @@ def __init__(self, path: str = default_path, ds: str = "gunpoint") -> None: """ :param path: a relative path to the stored UCR dataset. :type path: str - :param ds: Name of the dataset. Should be in (beef | coffee | ecg200 | freezer | gunpoint | insect | mixed_shapes | starlight). + :param ds: Name of the dataset. The list of names is available at https://www.cs.ucr.edu/~eamonn/time_series_data_2018/ (case sensitive!). :type ds: str :raises ValueError: When there is no stored UCR archive, or the name of the dataset is incorrect. @@ -150,48 +160,16 @@ def __init__(self, path: str = default_path, ds: str = "gunpoint") -> None: self.ds = ds.strip().lower() self.y_all: T.Optional[T.Collection[T.Hashable]] = None + path = os.path.join(path, ds) + train_files = glob.glob(os.path.join(path, "*TRAIN.tsv")) - if ds == "beef": - self.regular_train_path = os.path.join(path, "Beef") - self.small_train_path = os.path.join(path, "Beef") - elif ds == "coffee": - self.regular_train_path = os.path.join(path, "Coffee") - self.small_train_path = os.path.join(path, "Coffee") - elif ds == "ecg200": - self.regular_train_path = os.path.join(path, "ECG200") - self.small_train_path = os.path.join(path, "ECG200") - elif ds == "electric": - self.regular_train_path = os.path.join(path, "ElectricDevices") - self.small_train_path = os.path.join(path, "ElectricDevices") - elif ds == "freezer": - self.regular_train_path = os.path.join(path, "FreezerRegularTrain") - self.small_train_path = os.path.join(path, "FreezerSmallTrain") - elif ds == "gunpoint": - self.regular_train_path = os.path.join(path, "GunPoint") - self.small_train_path = os.path.join(path, "GunPoint") - elif ds == "insect": - self.regular_train_path = os.path.join(path, "InsectEPGRegularTrain") - self.small_train_path = os.path.join(path, path, "InsectEPGSmallTrain") - elif ds == "mixed_shapes": - self.regular_train_path = os.path.join(path, path, "MixedShapesRegularTrain") - self.small_train_path = os.path.join(path, path, "MixedShapesSmallTrain") - elif ds == "starlight": - self.regular_train_path = os.path.join(path, path, "StarLightCurves") - self.small_train_path = os.path.join(path, path, "StarLightCurves") - elif ds == "wafer": - self.regular_train_path = os.path.join(path, path, "Wafer") - self.small_train_path = os.path.join(path, path, "Wafer") - else: - raise ValueError("ds should be in (beef | coffee | ecg200 | freezer | gunpoint | insect | mixed_shapes | starlight)") - - self.small_train_df = pd.read_csv( - glob.glob(os.path.join(self.small_train_path, "*TRAIN.tsv"))[0], - sep='\t', header=None) + if len(train_files) == 0: + raise ValueError("ds should be listed at UCR website") self.train_df = pd.read_csv( - glob.glob(os.path.join(self.regular_train_path, "*TRAIN.tsv"))[0], + glob.glob(os.path.join(path, "*TRAIN.tsv"))[0], sep='\t', header=None) self.test_df = pd.read_csv( - glob.glob(os.path.join(self.regular_train_path, "*TEST.tsv"))[0], + glob.glob(os.path.join(path, "*TEST.tsv"))[0], sep='\t', header=None) self.X_train, self.y_train = self.train_df[self.train_df.columns[1:]].to_numpy(), self.train_df[self.train_df.columns[0]].to_numpy()