Merge pull request #77 from WenjieDu/dev
Fix loading UCR&UEA datasets in arff
WenjieDu authored Aug 13, 2024
2 parents 77ead43 + ad4e1fb commit bb8d859
Showing 6 changed files with 107 additions and 89 deletions.
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/dataset-addition.yml
@@ -20,7 +20,7 @@ body:
description: |
Please note that if the dataset's open-source implementation is not available, it'll take much more time to finish the implementation, so we are less likely to implement it in `TSDB`.
options:
- label: "The dataset implementation is publicly available"
- label: "The dataset is publicly available"

- type: textarea
id: additional-info
2 changes: 1 addition & 1 deletion .github/workflows/greetings.yml
@@ -18,7 +18,7 @@ jobs:
steps:
- uses: actions/first-interaction@v1
with:
repo-token: ${{ secrets.ACCESS_TOKEN }}
repo-token: ${{ secrets.GITHUB_TOKEN }}
issue-message: |
Hi there 👋,
1 change: 1 addition & 0 deletions tests/test_tsdb.py
@@ -23,6 +23,7 @@
"pems_traffic",
"solar_alabama",
"ucr_uea_Wine",
"ucr_uea_FingerMovements",
]


3 changes: 2 additions & 1 deletion tsdb/__init__.py
@@ -1,5 +1,6 @@
"""
TSDB (Time Series Data Beans): a Python toolbox to ease loading public time-series datasets.
TSDB (Time Series Data Beans): a Python toolbox that loads hundreds of public time-series datasets for machine/deep
learning with a single line of code.
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
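The new docstring promises single-line loading. As a hedged quick-start sketch (not part of this commit), assuming the package's top-level `tsdb.load()` entry point returns the dict assembled by `load_ucr_uea_dataset` in the next file, with a dataset name taken from the updated test list:

import tsdb

# Hypothetical usage: fetch one of the UCR/UEA datasets exercised in
# tests/test_tsdb.py.
data = tsdb.load("ucr_uea_FingerMovements")
print(data["X_train"].shape, data["y_train"].shape)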
186 changes: 101 additions & 85 deletions tsdb/loading_funcs/ucr_uea_datasets.py
@@ -14,36 +14,29 @@
import os
import warnings

import numpy
import numpy as np
from sklearn.utils.estimator_checks import _NotAnArray as NotAnArray

try:
from scipy.io import arff

HAS_ARFF = True
except Exception:
HAS_ARFF = False


def load_ucr_uea_dataset(local_path, dataset_name):
try:
# if both TXT and ARFF files are provided, the TXT versions are
# used
# both training and test data must be available in the same format
if _has_files(local_path, dataset_name, ext="txt"):
X_train, y_train = _load_txt_uea(
os.path.join(local_path, dataset_name + "_TRAIN.txt")
)
X_test, y_test = _load_txt_uea(
os.path.join(local_path, dataset_name + "_TEST.txt")
)
elif _has_files(local_path, dataset_name, ext="arff"):
if _has_files(local_path, dataset_name, ext="arff"):
X_train, y_train = _load_arff_uea(
os.path.join(local_path, dataset_name + "_TRAIN.arff")
)
X_test, y_test = _load_arff_uea(
os.path.join(local_path, dataset_name + "_TEST.arff")
)
elif _has_files(local_path, dataset_name, ext="txt"):
X_train, y_train = _load_txt_uea(
os.path.join(local_path, dataset_name + "_TRAIN.txt")
)
X_test, y_test = _load_txt_uea(
os.path.join(local_path, dataset_name + "_TEST.txt")
)
else:
warnings.warn(
'dataset "%s" is not provided in either TXT '
@@ -55,9 +48,9 @@ def load_ucr_uea_dataset(local_path, dataset_name):

data = {
"X_train": X_train,
"y_train": y_train.astype(float),
"y_train": y_train,
"X_test": X_test,
"y_test": y_test.astype(float),
"y_test": y_test,
}

return data
@@ -115,24 +108,24 @@ def ts_size(ts):
Examples
--------
>>> ts_size([1, 2, 3, numpy.nan])
>>> ts_size([1, 2, 3, np.nan])
3
>>> ts_size([1, numpy.nan])
>>> ts_size([1, np.nan])
1
>>> ts_size([numpy.nan])
>>> ts_size([np.nan])
0
>>> ts_size([[1, 2],
... [2, 3],
... [3, 4],
... [numpy.nan, 2],
... [numpy.nan, numpy.nan]])
... [np.nan, 2],
... [np.nan, np.nan]])
4
>>> ts_size([numpy.nan, 3, numpy.inf, numpy.nan])
>>> ts_size([np.nan, 3, np.inf, np.nan])
3
"""
ts_ = to_time_series(ts)
sz = ts_.shape[0]
while sz > 0 and numpy.all(numpy.isnan(ts_[sz - 1])):
while sz > 0 and np.all(np.isnan(ts_[sz - 1])):
sz -= 1
return sz

@@ -151,7 +144,7 @@ def to_time_series(ts, remove_nans=False):
Returns
-------
numpy.ndarray of shape (sz, d)
np.ndarray of shape (sz, d)
The transformed time series. This is always guaranteed to be a new
time series and never just a view into the old one.
@@ -160,19 +153,19 @@
>>> to_time_series([1, 2])
array([[1.],
[2.]])
>>> to_time_series([1, 2, numpy.nan])
>>> to_time_series([1, 2, np.nan])
array([[ 1.],
[ 2.],
[nan]])
>>> to_time_series([1, 2, numpy.nan], remove_nans=True)
>>> to_time_series([1, 2, np.nan], remove_nans=True)
array([[1.],
[2.]])
See Also
--------
to_time_series_dataset : Transforms a dataset of time series
"""
ts_out = numpy.array(ts, copy=True)
ts_out = np.array(ts, copy=True)
if ts_out.ndim <= 1:
ts_out = ts_out.reshape((-1, 1))
if ts_out.dtype != float:
@@ -196,7 +189,7 @@ def to_time_series_dataset(dataset, dtype=float):
Returns
-------
numpy.ndarray of shape (n_ts, sz, d)
np.ndarray of shape (n_ts, sz, d)
The transformed dataset of time series.
Examples
@@ -226,79 +219,102 @@ def to_time_series_dataset(dataset, dtype=float):
import pandas as pd

if isinstance(dataset, pd.DataFrame):
return to_time_series_dataset(numpy.array(dataset))
return to_time_series_dataset(np.array(dataset))
except ImportError:
pass
if isinstance(dataset, NotAnArray): # Patch to pass sklearn tests
return to_time_series_dataset(numpy.array(dataset))
return to_time_series_dataset(np.array(dataset))
if len(dataset) == 0:
return numpy.zeros((0, 0, 0))
if numpy.array(dataset[0]).ndim == 0:
return np.zeros((0, 0, 0))
if np.array(dataset[0]).ndim == 0:
dataset = [dataset]
n_ts = len(dataset)
max_sz = max([ts_size(to_time_series(ts, remove_nans=True)) for ts in dataset])
d = to_time_series(dataset[0]).shape[1]
dataset_out = numpy.zeros((n_ts, max_sz, d), dtype=dtype) + numpy.nan
dataset_out = np.zeros((n_ts, max_sz, d), dtype=dtype) + np.nan
for i in range(n_ts):
ts = to_time_series(dataset[i], remove_nans=True)
dataset_out[i, : ts.shape[0]] = ts
return dataset_out.astype(dtype)


def _load_arff_uea(dataset_path):
"""Load arff file for uni/multi variate dataset
def _load_arff_uea(
full_file_path_and_name,
replace_missing_vals_with="NaN",
):
"""Load data from a classification/regression WEKA arff file to a 3D np array.
Parameters
----------
dataset_path: string of dataset_path
Path to the ARFF file to be read
full_file_path_and_name: str
The full pathname of the .arff file to read.
replace_missing_vals_with: str
The value that missing values in the text file should be replaced
with prior to parsing.
Returns
-------
x: numpy array of shape (n_timeseries, n_timestamps, n_features)
Time series dataset
y: numpy array of shape (n_timeseries, )
Vector of targets
Raises
------
ImportError: if the version of *Scipy* is too old (pre 1.3.0)
Exception: on any failure, e.g. if the given file does not exist or is
corrupted
data: np.ndarray
time series data, np.ndarray (n_cases, n_channels, n_timepoints)
y : np.ndarray of string or int
target variable
"""
if not HAS_ARFF:
raise ImportError(
"scipy 1.3.0 or newer is required to load "
"time series datasets from arff format."
)
data, meta = arff.loadarff(dataset_path)
names = meta.names() # ["input", "class"] for multi-variate

# firstly get y_train
y_ = data[names[-1]] # data["class"]
y = numpy.array(y_).astype("str")

# get x_train
if len(names) == 2: # len=2 => multi-variate
x_ = data[names[0]]
x_ = numpy.asarray(x_.tolist())

nb_example = x_.shape[0]
nb_channel = x_.shape[1]
length_one_channel = len(x_.dtype.descr)
x = numpy.empty([nb_example, length_one_channel, nb_channel])

for i in range(length_one_channel):
# x_.dtype.descr: [('t1', '<f8'), ('t2', '<f8'), ('t3', '<f8')]
time_stamp = x_.dtype.descr[i][0] # ["t1", "t2", "t3"]
x[:, i, :] = x_[time_stamp]

else: # uni-variate situation
x_ = data[names[:-1]]
x = numpy.asarray(x_.tolist(), dtype=numpy.float32)
x = x.reshape(len(x), -1, 1)

return x, y
instance_list = []
class_val_list = []
data_started = False
is_multi_variate = False
is_first_case = True
n_cases = 0
n_channels = 1
with open(full_file_path_and_name, encoding="utf-8") as f:
for line in f:
if line.strip():
if (
is_multi_variate is False
and "@attribute" in line.lower()
and "relational" in line.lower()
):
is_multi_variate = True

if "@data" in line.lower():
data_started = True
continue
# if the @data tag has been found, the header information
# has been cleared and now data can be loaded
if data_started:
line = line.replace("?", replace_missing_vals_with)

if is_multi_variate:
line, class_val = line.split("',")
class_val_list.append(class_val.strip())
channels = line.split("\\n")
channels[0] = channels[0].replace("'", "")
if is_first_case:
n_channels = len(channels)
n_timepoints = len(channels[0].split(","))
is_first_case = False
elif len(channels) != n_channels:
raise ValueError(
f" Number of channels not equal in "
f"dataset, first case had {n_channels} channel "
f"but case number {n_cases + 1} has "
f"{len(channels)}"
)
inst = np.zeros(shape=(n_channels, n_timepoints))
for c in range(len(channels)):
split = channels[c].split(",")
inst[c] = np.array([float(i) for i in split])
else:
line_parts = line.split(",")
if is_first_case:
is_first_case = False
n_timepoints = len(line_parts) - 1
class_val_list.append(line_parts[-1].strip())
split = line_parts[: len(line_parts) - 1]
inst = np.zeros(shape=(n_channels, n_timepoints))
inst[0] = np.array([float(i) for i in split])
instance_list.append(inst)
return np.asarray(instance_list), np.asarray(class_val_list)
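For reference, a hedged micro-example (sample values made up, not from a real dataset) of the relational-ARFF row format that the multivariate branch above decomposes:

# Channels sit inside one quoted field, separated by a literal backslash-n,
# which is why the parser splits on the two-character string "\\n" rather
# than on a real newline.
line = "'1.0,2.0,3.0\\n4.0,5.0,6.0',left\n"
line, class_val = line.split("',")          # quoted series vs. class label
channels = line.split("\\n")                # one entry per channel
channels[0] = channels[0].replace("'", "")  # drop the opening quote
print(channels)           # ['1.0,2.0,3.0', '4.0,5.0,6.0']
print(class_val.strip())  # left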


def _load_txt_uea(dataset_path):
@@ -311,17 +327,17 @@ def _load_txt_uea(dataset_path):
Returns
-------
x: numpy array of shape (n_timeseries, n_timestamps, n_features)
x: np array of shape (n_timeseries, n_timestamps, n_features)
Time series dataset
y: numpy array of shape (n_timeseries, )
y: np array of shape (n_timeseries, )
Vector of targets
Raises
------
Exception: on any failure, e.g. if the given file does not exist or is
corrupted
"""
data = numpy.loadtxt(dataset_path)
data = np.loadtxt(dataset_path)
X = to_time_series_dataset(data[:, 1:])
y = data[:, 0].astype(int)
return X, y
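Putting the pieces together, a hedged sketch of what the rewritten loader returns (the local path is hypothetical; the shapes and labels reflect the FingerMovements dataset added to the test list, assuming 316 training cases of 28 channels by 50 time points):

from tsdb.loading_funcs.ucr_uea_datasets import load_ucr_uea_dataset

# Hypothetical directory holding FingerMovements_TRAIN.arff / _TEST.arff.
data = load_ucr_uea_dataset("path/to/FingerMovements", "FingerMovements")

# ARFF files now take precedence over TXT, and the new parser stacks each
# case as (n_channels, n_timepoints), so X_train is a 3-D array:
print(data["X_train"].shape)  # e.g. (316, 28, 50)

# Labels are returned as loaded (the .astype(float) casts were dropped),
# so string class values such as 'left'/'right' survive intact:
print(data["y_train"][:3])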
2 changes: 1 addition & 1 deletion tsdb/version.py
@@ -21,4 +21,4 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.6.1"
__version__ = "0.6.2"
