Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First version of Physionet. #454

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions skfda/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"fetch_nox",
"fetch_octane",
"fetch_phoneme",
"fetch_physionet",
"fetch_tecator",
"fetch_ucr",
"fetch_weather",
Expand Down Expand Up @@ -45,6 +46,7 @@
fetch_nox as fetch_nox,
fetch_octane as fetch_octane,
fetch_phoneme as fetch_phoneme,
fetch_physionet as fetch_physionet,
fetch_tecator as fetch_tecator,
fetch_ucr as fetch_ucr,
fetch_weather as fetch_weather,
Expand Down
121 changes: 121 additions & 0 deletions skfda/datasets/_real_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing_extensions import Literal

import rdata
import skdatasets

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[pep8] reported by reviewdog 🐶
F401 'skdatasets' imported but unused

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[pep8] reported by reviewdog 🐶
I001 isort found an import in the wrong position


from ..representation import FDataGrid
from ..typing._numpy import NDArrayFloat, NDArrayInt
Expand Down Expand Up @@ -213,6 +214,126 @@ def fetch_ucr(
return dataset


def _physionet_to_fdatagrid(
    name: str,
    data: DataFrame,
    mode: Literal[
        None,
        "pad_left",
        "pad_right",
        "truncate_left",
        "truncate_right",
    ],
) -> FDataGrid:
    """
    Build a FDataGrid from the ``"signal"`` column of a Physionet frame.

    Each entry of the column is expected to be a 2-d array of shape
    ``(length, dim_codomain)``. Signals are stacked into a single data
    matrix, padding with NaN or truncating as requested by ``mode``.

    Args:
        name: Dataset name, stored in the result and used in error messages.
        data: Frame with a ``"signal"`` column whose ``attrs`` contain the
            keys ``"fs"``, ``"sig_name"`` and ``"units"``. Its index is used
            as the sample names.
        mode: How to obtain a common length:

            - ``None``: require all signals to have the same length.
            - ``"pad_right"`` / ``"pad_left"``: extend every signal to the
              longest length, adding NaN at the end / beginning.
            - ``"truncate_right"`` / ``"truncate_left"``: cut every signal
              to the shortest length, dropping points at the end /
              beginning.

    Returns:
        FDataGrid with one sample per row of ``data``.

    Raises:
        ValueError: If ``mode`` is ``None`` and the signals have different
            lengths.

    """
    column = data.loc[:, "signal"]
    n_samples = len(column)

    # Positional access: the index holds the sample names, so ``column[0]``
    # would be a label lookup (and only works through a deprecated
    # positional fallback).
    first_signal = column.iloc[0]
    dim_codomain = first_signal.shape[1]

    lengths = [s.shape[0] for s in column]
    min_len = min(lengths)
    max_len = max(lengths)

    if mode is None and min_len != max_len:
        raise ValueError(
            f"Dataset {name} has signals of different lengths. Use the "
            "'mode' parameter to set a common length",
        )

    n_points = max_len if mode in {"pad_left", "pad_right"} else min_len

    # The matrix is pre-filled with NaN, which integer dtypes cannot
    # represent; fall back to float64 for non-inexact signals.
    dtype = first_signal.dtype
    if not np.issubdtype(dtype, np.inexact):
        dtype = np.dtype(np.float64)

    data_matrix = np.full(
        shape=(n_samples, n_points, dim_codomain),
        fill_value=np.nan,
        dtype=dtype,
    )

    # Signals keep their beginning when padding/truncating on the right.
    keep_start = mode in {None, "pad_right", "truncate_right"}

    for i, sample in enumerate(column):
        sample_len = sample.shape[0]
        copy_len = min(sample_len, n_points)

        if keep_start:
            data_matrix[i, :copy_len, :] = sample[:copy_len, :]
        else:
            # Explicit start offsets instead of ``-copy_len:``: a slice
            # starting at ``-0`` would select the whole axis.
            data_matrix[i, n_points - copy_len:, :] = (
                sample[sample_len - copy_len:, :]
            )

    # NOTE(review): if "fs" is a sampling frequency (samples per time unit),
    # the spacing between points should be 1 / fs rather than fs -- confirm
    # against the data source before changing.
    grid_points = np.linspace(
        0,
        column.attrs["fs"] * (n_points - 1),
        n_points,
    )

    coordinate_names = [
        f"{sig_name}({unit})"
        for sig_name, unit in zip(
            column.attrs["sig_name"],
            column.attrs["units"],
        )
    ]

    sample_names = list(data.index)

    return FDataGrid(
        data_matrix=data_matrix,
        grid_points=grid_points,
        dataset_name=name,
        coordinate_names=coordinate_names,
        sample_names=sample_names,
    )


# NOTE(review): ``Sequence`` and ``NDArrayAny`` are reported as undefined by
# the linter; they must be imported at the top of the module.
def fetch_physionet(
    name: str,
    *,
    return_X_y: bool = False,
    as_frame: bool = True,
    target_column: str | Sequence[str] | None = None,
    mode: Literal[
        None,
        "pad_left",
        "pad_right",
        "truncate_left",
        "truncate_right",
    ] = None,
    **kwargs: Any,
) -> (
    Bunch
    | Tuple[NDArrayAny, NDArrayAny | None]
    | Tuple[DataFrame, Series | DataFrame | None]
):
    """
    Fetch a dataset from Physionet.

    The ``"signal"`` column of the downloaded frame is converted to a
    :class:`FDataGrid` before the final dataset is assembled.

    Args:
        name: Dataset name.
        return_X_y: Forwarded to ``dataset_from_dataframe``; presumably
            selects a ``(X, y)`` tuple instead of a ``Bunch``.
        as_frame: Forwarded to ``dataset_from_dataframe``; presumably
            selects pandas objects instead of arrays. The download itself
            is always requested as a frame.
        target_column: Forwarded to ``dataset_from_dataframe``; presumably
            the column(s) to use as target.
        mode: How to give all signals a common length. ``None`` requires
            signals of equal length (a ``ValueError`` is raised otherwise),
            ``"pad_left"``/``"pad_right"`` fill with NaN up to the longest
            signal and ``"truncate_left"``/``"truncate_right"`` cut to the
            shortest one.
        kwargs: Additional parameters for the function
            :func:`skdatasets.repositories.physionet.fetch`.

    Returns:
        The dataset requested.

    Examples:
        >>> import skfda
        >>> X, y = skfda.datasets.fetch_physionet(
        ...     "ctu-uhb-ctgdb",
        ...     return_X_y=True,
        ...     mode="truncate_right",
        ... )

    """
    repositories = _get_skdatasets_repositories()

    # Always fetch as a frame: the conversion below works on ``dataset.frame``.
    dataset = repositories.physionet.fetch(name, as_frame=True, **kwargs)

    fdatagrid = _physionet_to_fdatagrid(name, data=dataset.frame, mode=mode)

    # Replace the raw signals with the functional data, keeping the index.
    dataset.frame.loc[:, "signal"] = pd.Series(
        fdatagrid,
        index=dataset.frame.index,
    )

    return repositories.base.dataset_from_dataframe(
        dataset.frame,
        return_X_y=return_X_y,
        as_frame=as_frame,
        target_column=target_column,
    )


def _fetch_cran_no_encoding_warning(*args: Any, **kwargs: Any) -> Any:
# Probably non thread safe
with warnings.catch_warnings():
Expand Down
1 change: 1 addition & 0 deletions skfda/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
__name__,
submodules=[
"feature_construction",
"missing",
"registration",
"smoothing",
"dim_reduction",
Expand Down
1 change: 1 addition & 0 deletions skfda/preprocessing/missing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ._interpolate import MissingValuesInterpolation

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[pep8] reported by reviewdog 🐶
D104 Missing docstring in public package

80 changes: 80 additions & 0 deletions skfda/preprocessing/missing/_interpolate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from typing import Any, TypeVar

import numpy as np
from scipy.interpolate import InterpolatedUnivariateSpline
from scipy.interpolate.interpnd import LinearNDInterpolator

from ..._utils._sklearn_adapter import BaseEstimator, InductiveTransformerMixin
from ...representation import FDataGrid
from ...typing._base import GridPoints
from ...typing._numpy import NDArrayFloat, NDArrayInt

T = TypeVar("T", bound=FDataGrid)


def _coords_from_indices(
    coord_indices: NDArrayInt,
    grid_points: GridPoints,
) -> NDArrayFloat:
    """
    Translate grid indices into grid coordinates.

    Args:
        coord_indices: Integer array with one row per point and one column
            per domain dimension (the layout produced by ``np.argwhere``).
        grid_points: Grid values for each domain dimension.

    Returns:
        Array with the same layout as ``coord_indices`` holding, for each
        index, the corresponding value of ``grid_points``.

    """
    per_dimension = []

    # After transposing, each row holds the indices along one dimension.
    for dim, indices_in_dim in enumerate(coord_indices.T):
        axis_values = grid_points[dim]
        per_dimension.append(axis_values[indices_in_dim])

    stacked = np.stack(per_dimension)
    return stacked.T


def _interpolate_nans(
    fdatagrid: T,
) -> T:
    """
    Return a copy of the data with its NaN values filled by interpolation.

    Every sample and every codomain coordinate is processed independently,
    using its non-NaN values as interpolation nodes. For one-dimensional
    domains a degree-1 spline is used and points outside the range of the
    valid nodes take the boundary value (``ext=3``). For higher dimensional
    domains :class:`scipy.interpolate.LinearNDInterpolator` is used.

    Args:
        fdatagrid: Functional data possibly containing NaN values.

    Returns:
        Copy of ``fdatagrid`` with a new data matrix; the input is not
        modified.

    """
    data_matrix = fdatagrid.data_matrix.copy()

    # Loop invariants.
    grid_points = fdatagrid.grid_points
    is_univariate = fdatagrid.dim_domain == 1

    for n_sample in range(fdatagrid.n_samples):
        for n_coord in range(fdatagrid.dim_codomain):

            data_points = data_matrix[n_sample, ..., n_coord]
            nan_pos = np.isnan(data_points)

            # Nothing to fill: avoid building an interpolator, which is
            # costly and can fail when there are too few points.
            if not nan_pos.any():
                continue

            valid_pos = ~nan_pos
            coords = _coords_from_indices(
                np.argwhere(valid_pos),
                grid_points,
            )
            desired_coords = _coords_from_indices(
                np.argwhere(nan_pos),
                grid_points,
            )
            values = data_points[valid_pos]

            if is_univariate:
                # k=1: piecewise linear; ext=3: boundary value outside
                # the range of the valid nodes.
                interpolation = InterpolatedUnivariateSpline(
                    coords,
                    values,
                    k=1,
                    ext=3,
                )
            else:
                interpolation = LinearNDInterpolator(
                    coords,
                    values,
                )

            new_values = interpolation(
                desired_coords,
            )

            data_matrix[n_sample, nan_pos, n_coord] = new_values.ravel()

    return fdatagrid.copy(data_matrix=data_matrix)


class MissingValuesInterpolation(
    BaseEstimator,
    InductiveTransformerMixin[T, T, Any],
):
    """
    Transformer that fills missing (NaN) values by interpolation.

    Each sample and codomain coordinate is interpolated independently from
    its non-NaN values: linearly for one-dimensional domains and with
    :class:`scipy.interpolate.LinearNDInterpolator` otherwise.

    """

    def transform(
        self,
        X: T,
    ) -> T:
        """
        Fill the NaN values of the data.

        Args:
            X: Functional data possibly containing NaN values.

        Returns:
            Copy of ``X`` with the NaN values replaced by interpolated ones.

        """
        return _interpolate_nans(X)