scikit-adaptation · kachayev · Feb 23, 2024 · Feb 23, 2024 · Feb 23, 2024 · Feb 26, 2024
diff --git a/skada/__init__.py b/skada/__init__.py
@@ -9,6 +9,7 @@
 from .version import __version__  # noqa: F401
 from . import model_selection
 from . import metrics
+from . import utils
 from .base import BaseAdapter, PerDomain, Shared
 from ._mapping import (
     ClassRegularizerOTMappingAdapter,
@@ -50,6 +51,7 @@
 __all__ = [
     "metrics",
     "model_selection",
+    "utils",
 
     "BaseAdapter",
     "PerDomain",

diff --git a/skada/tests/test_utils.py b/skada/tests/test_utils.py
@@ -10,6 +10,7 @@
     make_dataset_from_moons_distribution
 )
 from skada.utils import (
+    check_sample_domain,
     check_X_y_domain,
     check_X_domain,
     extract_source_indices,
@@ -149,10 +150,14 @@ def test_check_X_y_allow_exceptions():
 
     positive_numbers = random_sample_domain[random_sample_domain > 0]
     negative_numbers = random_sample_domain[random_sample_domain < 0]
-    # Count unique positive numbers
+
+    # Count number of sources and targets
     n_sources = len(np.unique(positive_numbers))
     n_targets = len(np.unique(negative_numbers))
 
+    # Shift target labels so that the sample_domain checker does not raise
+    random_sample_domain[random_sample_domain < 0] -= (n_sources + 1)
+
     with pytest.raises(
         ValueError,
         match=(
@@ -170,7 +175,7 @@ def test_check_X_y_allow_exceptions():
         match=(
             f"Number of targets provided is {n_targets} "
             f"and 'allow_target' is set to {allow_target}"
-            )
+        )
     ):
         check_X_y_domain(
             X, y, sample_domain=random_sample_domain,
@@ -225,10 +230,13 @@ def test_check_X_allow_exceptions():
     positive_numbers = random_sample_domain[random_sample_domain > 0]
     negative_numbers = random_sample_domain[random_sample_domain < 0]
 
-    # Count unique positive numbers
+    # Count number of sources and targets
     n_sources = len(np.unique(positive_numbers))
     n_targets = len(np.unique(negative_numbers))
 
+    # Shift target labels so that the sample_domain checker does not raise
+    random_sample_domain[random_sample_domain < 0] -= (n_sources + 1)
+
     with pytest.raises(
         ValueError,
         match=(
@@ -237,8 +245,10 @@ def test_check_X_allow_exceptions():
         )
     ):
         check_X_domain(
-            X, sample_domain=random_sample_domain,
-            allow_auto_sample_domain=False, allow_source=allow_source
+            X,
+            sample_domain=random_sample_domain,
+            allow_auto_sample_domain=False,
+            allow_source=allow_source
         )
 
     with pytest.raises(
@@ -249,8 +259,10 @@ def test_check_X_allow_exceptions():
         )
     ):
         check_X_domain(
-            X, sample_domain=random_sample_domain,
-            allow_auto_sample_domain=False, allow_target=allow_target
+            X,
+            sample_domain=random_sample_domain,
+            allow_auto_sample_domain=False,
+            allow_target=allow_target
         )
 
     with pytest.raises(
@@ -261,8 +273,10 @@ def test_check_X_allow_exceptions():
         )
     ):
         check_X_domain(
-            X, sample_domain=random_sample_domain,
-            allow_auto_sample_domain=False, allow_multi_source=allow_multi_source
+            X,
+            sample_domain=random_sample_domain,
+            allow_auto_sample_domain=False,
+            allow_multi_source=allow_multi_source
         )
 
     with pytest.raises(
@@ -273,8 +287,10 @@ def test_check_X_allow_exceptions():
         )
     ):
         check_X_domain(
-            X, sample_domain=random_sample_domain,
-            allow_auto_sample_domain=False, allow_multi_target=allow_multi_target
+            X,
+            sample_domain=random_sample_domain,
+            allow_auto_sample_domain=False,
+            allow_multi_target=allow_multi_target
         )
 
 
@@ -377,19 +393,21 @@ def test_source_target_merge():
     )
 
     # Test consistent length
-    with pytest.raises(ValueError,
-                       match="Inconsistent number of samples in source-target arrays "
-                       "and the number infered in the sample_domain"
-                       ):
+    with pytest.raises(
+        ValueError,
+        match="Inconsistent number of samples in source-target arrays "
+              "and the number infered in the sample_domain"
+    ):
         _ = source_target_merge(X_source[0], X_target[1], sample_domain=sample_domain)
 
     # Test no sample domain
     _ = source_target_merge(X_source, X_target, sample_domain=None)
 
     # Test odd number of array
-    with pytest.raises(ValueError,
-                       match="Even number of arrays required as input"
-                       ):
+    with pytest.raises(
+        ValueError,
+        match="Even number of arrays required as input"
+    ):
         _ = source_target_merge(
             X_source,
             X_target,
@@ -407,16 +425,18 @@ def test_source_target_merge():
     )
 
     # Test one array
-    with pytest.raises(ValueError,
-                       match="At least two array required as input"
-                       ):
+    with pytest.raises(
+        ValueError,
+        match="At least two array required as input"
+    ):
         _ = source_target_merge(X_source, sample_domain=sample_domain)
 
     # Test y_target = None + Inconsistent number of samples in source-target
-    with pytest.raises(ValueError,
-                       match="Inconsistent number of samples in source-target arrays "
-                       "and the number infered in the sample_domain"
-                       ):
+    with pytest.raises(
+        ValueError,
+        match="Inconsistent number of samples in source-target arrays "
+              "and the number infered in the sample_domain"
+    ):
         _ = source_target_merge(
             X_source,
             X_target,
@@ -435,20 +455,31 @@ def test_source_target_merge():
     )
 
     # Test 2 None in a pair of arrays
-    with pytest.raises(ValueError,
-                       match="Only one array can be None or empty in each pair"
-                       ):
+    with pytest.raises(
+        ValueError,
+        match="Only one array can be None or empty in each pair"
+    ):
         _ = source_target_merge(None, None, sample_domain=sample_domain)
 
     # Test 1 None in 2 pair of arrays
     _ = source_target_merge(X_source, None, y_source, None, sample_domain=sample_domain)
 
     # Test inconsistent number of features
-    with pytest.raises(ValueError,
-                       match="Inconsistent number of features in source-target arrays"
-                       ):
+    with pytest.raises(
+        ValueError,
+        match="Inconsistent number of features in source-target arrays"
+    ):
         _ = source_target_merge(
             X_source[:, :-1],
             X_target,
             sample_domain=sample_domain
         )
+
+
+def test_check_sample_domain_lodo():
+    # valid 'lodo' packing: each label appears with both signs, equal counts
+    check_sample_domain(np.array([1, 1, 2, 2, 2, -1, -1, -2, -2, -2]))
+
+    # 'lodo' packing with mismatched counts (e.g. two 1s but a single -1)
+    with pytest.raises(ValueError):
+        check_sample_domain(np.array([1, 1, -1, 2, 2, 2, -2, -2]))
diff --git a/skada/utils.py b/skada/utils.py
@@ -129,7 +129,7 @@ def check_X_domain(
     separately to avoid additional scan for 'sample_domain' array.
 
     Parameters:
-    ----------
+    -----------
     X : array-like of shape (n_samples, n_features)
         Input features.
     sample_domain : array-like of shape (n_samples,)
@@ -148,7 +148,7 @@ def check_X_domain(
         Allow automatic generation of sample_domain if not provided.
 
     Returns:
-    ----------
+    --------
     X : array
         Input features.
     sample_domain : array
@@ -196,26 +196,53 @@ def check_X_domain(
     return X, sample_domain
 
 
-def extract_source_indices(sample_domain):
-    """Extract the indices of the source samples.
+def check_sample_domain(sample_domain):
+    """Validate the `sample_domain` array used for domain adaptation.
+
+    A valid `sample_domain` array contains each domain label either as a
+    source (positive) or as a target (negative). The only exception is
+    'lodo' (Leave-One-Domain-Out) packing: every label must then appear
+    with both signs and equal counts, or a ValueError is raised.
 
     Parameters:
-    ----------
+    -----------
     sample_domain : array-like of shape (n_samples,)
-        Array specifying the domain labels for each sample.
+        Domain labels for each sample.
 
     Returns:
-    ----------
-    source_idx : array
-        Boolean array indicating source indices.
+    --------
+    sample_domain : array of shape (n_samples,)
+        Validated domain labels for each sample, as an int32 array.
     """
     sample_domain = check_array(
         sample_domain,
         dtype=np.int32,
         ensure_2d=False,
         input_name='sample_domain'
     )
+    indices, counters = np.unique(sample_domain, return_counts=True)
+    counts = dict(zip(indices.tolist(), counters.tolist()))
+    # label 0 equals its own negation, so it must not trigger the 'lodo' check
+    if any(idx != 0 and -idx in counts for idx in counts):
+        if any(counts.get(-idx) != cnt for idx, cnt in counts.items()):
+            raise ValueError("Invalid 'sample_domain' array structure.")
+    return sample_domain
+
 
+def extract_source_indices(sample_domain):
+    """Build a boolean mask selecting the source samples.
+
+    Parameters:
+    -----------
+    sample_domain : array-like of shape (n_samples,)
+        Domain labels for each sample.
+
+    Returns:
+    --------
+    source_idx : array
+        Boolean mask, True where the domain label is non-negative.
+    """
+    # validate first; the mask is computed on the validated labels
+    validated = check_sample_domain(sample_domain)
+    return validated >= 0