Merge pull request #77 from WenjieDu/dev
Fix loading UCR&UEA datasets in arff
WenjieDu authored Aug 13, 2024
2 parents 77ead43 + ad4e1fb commit bb8d859
Showing 6 changed files with 107 additions and 89 deletions.
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/dataset-addition.yml
@@ -20,7 +20,7 @@ body:
description: |
Please note that if the dataset's open-source implementation is not available, it'll take much more time to finish the implementation, so we are less likely to implement it in `TSDB`.
options:
- label: "The dataset implementation is publicly available"
- label: "The dataset is publicly available"

- type: textarea
id: additional-info
2 changes: 1 addition & 1 deletion .github/workflows/greetings.yml
@@ -18,7 +18,7 @@ jobs:
steps:
- uses: actions/first-interaction@v1
with:
repo-token: ${{ secrets.ACCESS_TOKEN }}
repo-token: ${{ secrets.GITHUB_TOKEN }}
issue-message: |
Hi there 👋,
1 change: 1 addition & 0 deletions tests/test_tsdb.py
@@ -23,6 +23,7 @@
"pems_traffic",
"solar_alabama",
"ucr_uea_Wine",
"ucr_uea_FingerMovements",
]


3 changes: 2 additions & 1 deletion tsdb/__init__.py
@@ -1,5 +1,6 @@
"""
TSDB (Time Series Data Beans): a Python toolbox to ease loading public time-series datasets.
TSDB (Time Series Data Beans): a Python toolbox that loads hundreds of public time-series datasets for machine/deep
learning with a single line of code.
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
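The new docstring promises single-line loading. As a hedged quick-start sketch (not part of this commit), assuming the package's top-level `tsdb.load()` entry point returns the dict assembled by `load_ucr_uea_dataset` in the next file, with a dataset name taken from the updated test list:

import tsdb

# Hypothetical usage: fetch one of the UCR/UEA datasets exercised in
# tests/test_tsdb.py.
data = tsdb.load("ucr_uea_FingerMovements")
print(data["X_train"].shape, data["y_train"].shape)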
186 changes: 101 additions & 85 deletions tsdb/loading_funcs/ucr_uea_datasets.py
@@ -14,36 +14,29 @@
import os
import warnings

import numpy
import numpy as np
from sklearn.utils.estimator_checks import _NotAnArray as NotAnArray

try:
from scipy.io import arff

HAS_ARFF = True
except Exception:
HAS_ARFF = False


def load_ucr_uea_dataset(local_path, dataset_name):
try:
# if both TXT and ARFF files are provided, the TXT versions are
# used
# both training and test data must be available in the same format
if _has_files(local_path, dataset_name, ext="txt"):
X_train, y_train = _load_txt_uea(
os.path.join(local_path, dataset_name + "_TRAIN.txt")
)
X_test, y_test = _load_txt_uea(
os.path.join(local_path, dataset_name + "_TEST.txt")
)
elif _has_files(local_path, dataset_name, ext="arff"):
if _has_files(local_path, dataset_name, ext="arff"):
X_train, y_train = _load_arff_uea(
os.path.join(local_path, dataset_name + "_TRAIN.arff")
)
X_test, y_test = _load_arff_uea(
os.path.join(local_path, dataset_name + "_TEST.arff")
)
elif _has_files(local_path, dataset_name, ext="txt"):
X_train, y_train = _load_txt_uea(
os.path.join(local_path, dataset_name + "_TRAIN.txt")
)
X_test, y_test = _load_txt_uea(
os.path.join(local_path, dataset_name + "_TEST.txt")
)
else:
warnings.warn(
'dataset "%s" is not provided in either TXT '
@@ -55,9 +48,9 @@ def load_ucr_uea_dataset(local_path, dataset_name):

data = {
"X_train": X_train,
"y_train": y_train.astype(float),
"y_train": y_train,
"X_test": X_test,
"y_test": y_test.astype(float),
"y_test": y_test,
}

return data
@@ -115,24 +108,24 @@ def ts_size(ts):
Examples
--------
>>> ts_size([1, 2, 3, numpy.nan])
>>> ts_size([1, 2, 3, np.nan])
3
>>> ts_size([1, numpy.nan])
>>> ts_size([1, np.nan])
1
>>> ts_size([numpy.nan])
>>> ts_size([np.nan])
0
>>> ts_size([[1, 2],
... [2, 3],
... [3, 4],
... [numpy.nan, 2],
... [numpy.nan, numpy.nan]])
... [np.nan, 2],
... [np.nan, np.nan]])
4
>>> ts_size([numpy.nan, 3, numpy.inf, numpy.nan])
>>> ts_size([np.nan, 3, np.inf, np.nan])
3
"""
ts_ = to_time_series(ts)
sz = ts_.shape[0]
while sz > 0 and numpy.all(numpy.isnan(ts_[sz - 1])):
while sz > 0 and np.all(np.isnan(ts_[sz - 1])):
sz -= 1
return sz

@@ -151,7 +144,7 @@ def to_time_series(ts, remove_nans=False):
Returns
-------
numpy.ndarray of shape (sz, d)
np.ndarray of shape (sz, d)
The transformed time series. This is always guaranteed to be a new
time series and never just a view into the old one.
@@ -160,19 +153,19 @@
>>> to_time_series([1, 2])
array([[1.],
[2.]])
>>> to_time_series([1, 2, numpy.nan])
>>> to_time_series([1, 2, np.nan])
array([[ 1.],
[ 2.],
[nan]])
>>> to_time_series([1, 2, numpy.nan], remove_nans=True)
>>> to_time_series([1, 2, np.nan], remove_nans=True)
array([[1.],
[2.]])
See Also
--------
to_time_series_dataset : Transforms a dataset of time series
"""
ts_out = numpy.array(ts, copy=True)
ts_out = np.array(ts, copy=True)
if ts_out.ndim <= 1:
ts_out = ts_out.reshape((-1, 1))
if ts_out.dtype != float:
@@ -196,7 +189,7 @@ def to_time_series_dataset(dataset, dtype=float):
Returns
-------
numpy.ndarray of shape (n_ts, sz, d)
np.ndarray of shape (n_ts, sz, d)
The transformed dataset of time series.
Examples
@@ -226,79 +219,102 @@ def to_time_series_dataset(dataset, dtype=float):
import pandas as pd

if isinstance(dataset, pd.DataFrame):
return to_time_series_dataset(numpy.array(dataset))
return to_time_series_dataset(np.array(dataset))
except ImportError:
pass
if isinstance(dataset, NotAnArray): # Patch to pass sklearn tests
return to_time_series_dataset(numpy.array(dataset))
return to_time_series_dataset(np.array(dataset))
if len(dataset) == 0:
return numpy.zeros((0, 0, 0))
if numpy.array(dataset[0]).ndim == 0:
return np.zeros((0, 0, 0))
if np.array(dataset[0]).ndim == 0:
dataset = [dataset]
n_ts = len(dataset)
max_sz = max([ts_size(to_time_series(ts, remove_nans=True)) for ts in dataset])
d = to_time_series(dataset[0]).shape[1]
dataset_out = numpy.zeros((n_ts, max_sz, d), dtype=dtype) + numpy.nan
dataset_out = np.zeros((n_ts, max_sz, d), dtype=dtype) + np.nan
for i in range(n_ts):
ts = to_time_series(dataset[i], remove_nans=True)
dataset_out[i, : ts.shape[0]] = ts
return dataset_out.astype(dtype)


def _load_arff_uea(dataset_path):
"""Load arff file for uni/multi variate dataset
def _load_arff_uea(
full_file_path_and_name,
replace_missing_vals_with="NaN",
):
"""Load data from a classification/regression WEKA arff file to a 3D np array.
Parameters
----------
dataset_path: string of dataset_path
Path to the ARFF file to be read
full_file_path_and_name: str
The full pathname of the .arff file to read.
replace_missing_vals_with: str
The value that missing values in the text file should be replaced
with prior to parsing.
Returns
-------
x: numpy array of shape (n_timeseries, n_timestamps, n_features)
Time series dataset
y: numpy array of shape (n_timeseries, )
Vector of targets
Raises
------
ImportError: if the version of *Scipy* is too old (pre 1.3.0)
Exception: on any failure, e.g. if the given file does not exist or is
corrupted
data: np.ndarray
time series data, np.ndarray (n_cases, n_channels, n_timepoints)
y : np.ndarray of string or int
target variable
"""
if not HAS_ARFF:
raise ImportError(
"scipy 1.3.0 or newer is required to load "
"time series datasets from arff format."
)
data, meta = arff.loadarff(dataset_path)
names = meta.names() # ["input", "class"] for multi-variate

# firstly get y_train
y_ = data[names[-1]] # data["class"]
y = numpy.array(y_).astype("str")

# get x_train
if len(names) == 2: # len=2 => multi-variate
x_ = data[names[0]]
x_ = numpy.asarray(x_.tolist())

nb_example = x_.shape[0]
nb_channel = x_.shape[1]
length_one_channel = len(x_.dtype.descr)
x = numpy.empty([nb_example, length_one_channel, nb_channel])

for i in range(length_one_channel):
# x_.dtype.descr: [('t1', '<f8'), ('t2', '<f8'), ('t3', '<f8')]
time_stamp = x_.dtype.descr[i][0] # ["t1", "t2", "t3"]
x[:, i, :] = x_[time_stamp]

else: # uni-variate situation
x_ = data[names[:-1]]
x = numpy.asarray(x_.tolist(), dtype=numpy.float32)
x = x.reshape(len(x), -1, 1)

return x, y
instance_list = []
class_val_list = []
data_started = False
is_multi_variate = False
is_first_case = True
n_cases = 0
n_channels = 1
with open(full_file_path_and_name, encoding="utf-8") as f:
for line in f:
if line.strip():
if (
is_multi_variate is False
and "@attribute" in line.lower()
and "relational" in line.lower()
):
is_multi_variate = True

if "@data" in line.lower():
data_started = True
continue
# if the @data tag has been found, the header information
# has been cleared and now data can be loaded
if data_started:
line = line.replace("?", replace_missing_vals_with)

if is_multi_variate:
line, class_val = line.split("',")
class_val_list.append(class_val.strip())
channels = line.split("\\n")
channels[0] = channels[0].replace("'", "")
if is_first_case:
n_channels = len(channels)
n_timepoints = len(channels[0].split(","))
is_first_case = False
elif len(channels) != n_channels:
raise ValueError(
f" Number of channels not equal in "
f"dataset, first case had {n_channels} channel "
f"but case number {n_cases + 1} has "
f"{len(channels)}"
)
inst = np.zeros(shape=(n_channels, n_timepoints))
for c in range(len(channels)):
split = channels[c].split(",")
inst[c] = np.array([float(i) for i in split])
else:
line_parts = line.split(",")
if is_first_case:
is_first_case = False
n_timepoints = len(line_parts) - 1
class_val_list.append(line_parts[-1].strip())
split = line_parts[: len(line_parts) - 1]
inst = np.zeros(shape=(n_channels, n_timepoints))
inst[0] = np.array([float(i) for i in split])
instance_list.append(inst)
return np.asarray(instance_list), np.asarray(class_val_list)
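For reference, a hedged micro-example (sample values made up, not from a real dataset) of the relational-ARFF row format that the multivariate branch above decomposes:

# Channels sit inside one quoted field, separated by a literal backslash-n,
# which is why the parser splits on the two-character string "\\n" rather
# than on a real newline.
line = "'1.0,2.0,3.0\\n4.0,5.0,6.0',left\n"
line, class_val = line.split("',")          # quoted series vs. class label
channels = line.split("\\n")                # one entry per channel
channels[0] = channels[0].replace("'", "")  # drop the opening quote
print(channels)           # ['1.0,2.0,3.0', '4.0,5.0,6.0']
print(class_val.strip())  # left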


def _load_txt_uea(dataset_path):
@@ -311,17 +327,17 @@ def _load_txt_uea(dataset_path):
Returns
-------
x: numpy array of shape (n_timeseries, n_timestamps, n_features)
x: np array of shape (n_timeseries, n_timestamps, n_features)
Time series dataset
y: numpy array of shape (n_timeseries, )
y: np array of shape (n_timeseries, )
Vector of targets
Raises
------
Exception: on any failure, e.g. if the given file does not exist or is
corrupted
"""
data = numpy.loadtxt(dataset_path)
data = np.loadtxt(dataset_path)
X = to_time_series_dataset(data[:, 1:])
y = data[:, 0].astype(int)
return X, y
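Putting the pieces together, a hedged sketch of what the rewritten loader returns (the local path is hypothetical; the shapes and labels reflect the FingerMovements dataset added to the test list, assuming 316 training cases of 28 channels by 50 time points):

from tsdb.loading_funcs.ucr_uea_datasets import load_ucr_uea_dataset

# Hypothetical directory holding FingerMovements_TRAIN.arff / _TEST.arff.
data = load_ucr_uea_dataset("path/to/FingerMovements", "FingerMovements")

# ARFF files now take precedence over TXT, and the new parser stacks each
# case as (n_channels, n_timepoints), so X_train is a 3-D array:
print(data["X_train"].shape)  # e.g. (316, 28, 50)

# Labels are returned as loaded (the .astype(float) casts were dropped),
# so string class values such as 'left'/'right' survive intact:
print(data["y_train"][:3])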
2 changes: 1 addition & 1 deletion tsdb/version.py
@@ -21,4 +21,4 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.6.1"
__version__ = "0.6.2"
