NeuroTechX · brunaafl · Jun 6, 2024 · Jun 7, 2024 · Jun 10, 2024 · Jun 10, 2024
diff --git a/docs/source/evaluations.rst b/docs/source/evaluations.rst
@@ -19,6 +19,13 @@ Evaluations
     CrossSubjectEvaluation
 
 
+.. autosummary::
+    :toctree: generated/
+    :template: class.rst
+
+    WithinSessionSplitter
+
+
 ------------
 Base & Utils
 ------------

diff --git a/docs/source/images/withinsess.pdf b/docs/source/images/withinsess.pdf
diff --git a/docs/source/images/withinsess.png b/docs/source/images/withinsess.png
diff --git a/docs/source/whats_new.rst b/docs/source/whats_new.rst
@@ -17,6 +17,7 @@ Develop branch
 
 Enhancements
 ~~~~~~~~~~~~
+- Adding :class:`moabb.evaluations.splitters.WithinSessionSplitter` (:gh:`664` by `Bruna Lopes_`)
 
 Bugs
 ~~~~

diff --git a/images/withinsess.png b/images/withinsess.png
diff --git a/moabb/evaluations/__init__.py b/moabb/evaluations/__init__.py
@@ -9,4 +9,5 @@
     CrossSubjectEvaluation,
     WithinSessionEvaluation,
 )
+from .splitters import WithinSessionSplitter
 from .utils import create_save_path, save_model_cv, save_model_list
diff --git a/moabb/evaluations/splitters.py b/moabb/evaluations/splitters.py
@@ -0,0 +1,109 @@
+from sklearn.model_selection import BaseCrossValidator, StratifiedKFold
+from sklearn.utils import check_random_state
+
+
+class WithinSessionSplitter(BaseCrossValidator):
+    """Data splitter for within session evaluation.
+
+    Within-session evaluation uses k-fold cross_validation to determine train
+    and test sets on separate session for each subject. This splitter assumes that
+    all data from all subjects is already known and loaded.
-    and test sets on separate session for each subject. This splitter assumes that
-    all data from all subjects is already known and loaded.
+    and test sets for each subject in each session. This splitter
+    assumes that all data from all subjects is already known and loaded.
-    and test sets on separate session for each subject. This splitter assumes that
-    all data from all subjects is already known and loaded.
+    and test sets for each subject in each session. This splitter
+    assumes that all data from all subjects is already known and loaded.
+
+    .. image:: images/withinsess.png
+        :alt: The schematic diagram of the WithinSession split
+        :align: center
+
+
+    Parameters
+    ----------
+    n_folds : int
+        Number of folds. Must be at least 2.
-        Number of folds. Must be at least 2.
+        Number of folds for the within-session k-fold split. Must be at least 2.
-        Number of folds. Must be at least 2.
+        Number of folds for the within-session k-fold split. Must be at least 2.
+    random_state: int, RandomState instance or None, default=None
+        Important when `shuffle` is True. Controls the randomness of splits.
+        Pass an int for reproducible output across multiple function calls.
-    random_state: int, RandomState instance or None, default=None
-        Important when `shuffle` is True. Controls the randomness of splits.
-        Pass an int for reproducible output across multiple function calls.
+    random_state: int, RandomState instance or None, default=None
+        Controls the randomness of splits. Only used when `shuffle` is True.
+        Pass an int for reproducible output across multiple function calls.
-    random_state: int, RandomState instance or None, default=None
-        Important when `shuffle` is True. Controls the randomness of splits.
-        Pass an int for reproducible output across multiple function calls.
+    random_state: int, RandomState instance or None, default=None
+        Controls the randomness of splits. Only used when `shuffle` is True.
+        Pass an int for reproducible output across multiple function calls.
+    shuffle_session : bool, default=True
+        Whether to shuffle each class's samples before splitting into batches.
+        Note that the samples within each split will not be shuffled.
+    shuffle_subjects : bool, default=False
+        Apply shuffle in mixing subjects and sessions, this parameter allows
+        sample iterations of the sppliter.
+
+    Examples
+    -----------
+
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> from moabb.evaluations.splitters import WithinSessionSplitter
+    >>> X = np.array([[1, 2], [3, 4], [5, 6], [1,4], [7, 4], [5, 8], [0,3], [2,4]])
+    >>> y = np.array([1, 2, 1, 2, 1, 2, 1, 2])
+    >>> subjects = np.array([1, 1, 1, 1, 1, 1, 1, 1])
+    >>> sessions = np.array(['T', 'T', 'E', 'E', 'T', 'T', 'E', 'E'])
+    >>> metadata = pd.DataFrame(data={'subject': subjects, 'session': sessions})
+    >>> csess = WithinSessionSplitter(n_folds=2)
+    >>> csess.get_n_splits(metadata)
+    4
+    >>> for i, (train_index, test_index) in enumerate(csess.split(y, metadata)):
+    ...    print(f"Fold {i}:")
+    ...    print(f"  Train: index={train_index}, group={subjects[train_index]}, session={sessions[train_index]}")
+    ...    print(f"  Test:  index={test_index}, group={subjects[test_index]}, sessions={sessions[test_index]}")
+    Fold 0:
+      Train: index=[2 7], group=[1 1], session=['E' 'E']
+      Test:  index=[3 6], group=[1 1], sessions=['E' 'E']
+    Fold 1:
+      Train: index=[3 6], group=[1 1], session=['E' 'E']
+      Test:  index=[2 7], group=[1 1], sessions=['E' 'E']
+    Fold 2:
+      Train: index=[4 5], group=[1 1], session=['T' 'T']
+      Test:  index=[0 1], group=[1 1], sessions=['T' 'T']
+    Fold 3:
+      Train: index=[0 1], group=[1 1], session=['T' 'T']
+      Test:  index=[4 5], group=[1 1], sessions=['T' 'T']
+    """
+
+    def __init__(
+        self,
+        n_folds: int = 5,
+        random_state: int = 42,
+        shuffle_subjects: bool = False,
+        shuffle_session: bool = True,
-        random_state: int = 42,
-        shuffle_subjects: bool = False,
-        shuffle_session: bool = True,
+        shuffle_subjects: bool = False,
+        shuffle_session: bool = True,
+        random_state: int = None,
-        random_state: int = 42,
-        shuffle_subjects: bool = False,
-        shuffle_session: bool = True,
+        shuffle_subjects: bool = False,
+        shuffle_session: bool = True,
+        random_state: int = None,
+    ):
+        self.n_folds = n_folds
+        self.shuffle_subjects = shuffle_subjects
+        self.shuffle_session = shuffle_session
+        self.random_state = check_random_state(random_state)
+
+    def get_n_splits(self, metadata):
+        num_sessions_subjects = metadata.groupby(["subject", "session"]).ngroups
+        return self.n_folds * num_sessions_subjects
+
+    def split(self, y, metadata, **kwargs):
+        all_index = metadata.index.values
+        subjects = metadata.subject.unique()
-        subjects = metadata.subject.unique()
+        subjects = metadata['subject'].unique()
-        subjects = metadata.subject.unique()
+        subjects = metadata['subject'].unique()
+
+        # Shuffle subjects if required
+        if self.shuffle_subjects:
+            self.random_state.shuffle(subjects)
+
+        for subject in subjects:
+            subject_mask = metadata.subject == subject
+            subject_indices = all_index[subject_mask]
+            subject_metadata = metadata[subject_mask]
+            sessions = subject_metadata.session.unique()
+
+            # Shuffle sessions if required
+            if self.shuffle_session:
+                self.random_state.shuffle(sessions)
+
+            for session in sessions:
+                session_mask = subject_metadata.session == session
+                indices = subject_indices[session_mask]
+                group_y = y[indices]
+
+                # Use StratifiedKFold with the group-specific random state
+                cv = StratifiedKFold(
+                    n_splits=self.n_folds,
+                    shuffle=self.shuffle_session,
+                    random_state=self.random_state,
+                )
+                for ix_train, ix_test in cv.split(indices, group_y):
+                    yield indices[ix_train], indices[ix_test]
diff --git a/moabb/evaluations/utils.py b/moabb/evaluations/utils.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
+import re
 from pathlib import Path
 from pickle import HIGHEST_PROTOCOL, dump
 from typing import Sequence
 
+import numpy as np
 from numpy import argmax
 from sklearn.pipeline import Pipeline
 
@@ -222,6 +224,17 @@ def create_save_path(
         print("No hdf5_path provided, models will not be saved.")
 
 
+def sort_group(groups):
+    runs_sort = []
+    pattern = r"([0-9]+)(|[a-zA-Z]+[a-zA-Z0-9]*)"
+    for i, group in enumerate(groups):
+        index, description = re.fullmatch(pattern, group).groups()
+        index = int(index)
+        runs_sort.append(index)
+    sorted_ix = np.argsort(runs_sort)
+    return groups[sorted_ix]
+
+
 def _convert_sklearn_params_to_optuna(param_grid: dict) -> dict:
     """
     Function to convert the parameter in Optuna format. This function will

diff --git a/moabb/tests/splits.py b/moabb/tests/splits.py
@@ -0,0 +1,58 @@
+import numpy as np
+import pytest
+from sklearn.model_selection import StratifiedKFold
+from sklearn.utils import check_random_state
+
+from moabb.datasets.fake import FakeDataset
+from moabb.evaluations.splitters import WithinSessionSplitter
+from moabb.paradigms.motor_imagery import FakeImageryParadigm
+
+
+dataset = FakeDataset(["left_hand", "right_hand"], n_subjects=3, seed=12)
+paradigm = FakeImageryParadigm()
+
+
+# Split done for the Within Session evaluation
+def eval_split_within_session(shuffle, random_state):
+    random_state = check_random_state(random_state) if shuffle else None
+    for subject in dataset.subject_list:
+        X, y, metadata = paradigm.get_data(dataset=dataset, subjects=[subject])
+        sessions = metadata.session
+        for session in np.unique(sessions):
+            ix = sessions == session
+            cv = StratifiedKFold(n_splits=5, shuffle=shuffle, random_state=random_state)
+            X_, metadata_, y_ = X[ix], y[ix], metadata[ix]
+            for train, test in cv.split(y_, metadata_):
+                yield X_[train], X_[test]
+
+
+@pytest.mark.parametrize("shuffle", [True, False])
+@pytest.mark.parametrize("random_state", [0, 42])
+def test_within_session(shuffle, random_state):
+    X, y, metadata = paradigm.get_data(dataset=dataset)
+
+    split = WithinSessionSplitter(n_folds=5, shuffle=shuffle, random_state=random_state)
+
+    for (X_train_t, X_test_t), (train, test) in zip(
+        eval_split_within_session(shuffle=shuffle, random_state=random_state),
+        split.split(y, metadata),
+    ):
+        X_train, X_test = X[train], X[test]
+
+        # Check if the output is the same as the input
+        assert np.array_equal(X_train, X_train_t)
+        assert np.array_equal(X_test, X_test_t)
+
+
+def test_is_shuffling():
+    X, y, metadata = paradigm.get_data(dataset=dataset)
+
+    split = WithinSessionSplitter(n_folds=5, shuffle=False)
+    split_shuffle = WithinSessionSplitter(n_folds=5, shuffle=True, random_state=3)
+
+    for (train, test), (train_shuffle, test_shuffle) in zip(
+        split.split(y, metadata), split_shuffle.split(y, metadata)
+    ):
+        # Check if the output is the same as the input
+        assert np.array_equal(train, train_shuffle) == False
+        assert np.array_equal(test, test_shuffle) == False