Add bboxes sample data #231

Merged · 12 commits · Jun 28, 2024
31 changes: 29 additions & 2 deletions CONTRIBUTING.md
@@ -322,17 +322,44 @@ To add a new file, you will need to:
2. Ask to be added as a collaborator on the [movement data repository](https://gin.g-node.org/neuroinformatics/movement-test-data) (if not already)
3. Download the [GIN CLI](https://gin.g-node.org/G-Node/Info/wiki/GIN+CLI+Setup#quickstart) and set it up with your GIN credentials, by running `gin login` in a terminal.
4. Clone the movement data repository to your local machine, by running `gin get neuroinformatics/movement-test-data` in a terminal.
5. Add your new files to the `poses`, `videos`, and/or `frames` folders as appropriate. Follow the existing file naming conventions as closely as possible.
6. Determine the sha256 checksum hash of each new file by running `sha256sum <filename>` in a terminal. For convenience, we've included a `get_sha256_hashes.py` script in the [movement data repository](https://gin.g-node.org/neuroinformatics/movement-test-data). If you run this from the root of the data repository, within a Python environment with `movement` installed, it will calculate the sha256 hashes for all files in the `poses`, `videos`, and `frames` folders and write them to files named `poses_hashes.txt`, `videos_hashes.txt`, and `frames_hashes.txt`, respectively.
5. Add your new files to the `poses`, `videos`, `frames`, and/or `bboxes` folders as appropriate. Follow the existing file naming conventions as closely as possible.
6. Determine the sha256 checksum hash of each new file. You can do this in a terminal by running:
::::{tab-set}

:::{tab-item} Ubuntu
```bash
sha256sum <filename>
```
:::

:::{tab-item} macOS
```bash
shasum -a 256 <filename>
```
:::

:::{tab-item} Windows
```powershell
certutil -hashfile <filename> SHA256
```
:::
::::
For convenience, we've included a `get_sha256_hashes.py` script in the [movement data repository](https://gin.g-node.org/neuroinformatics/movement-test-data). If you run this from the root of the data repository, within a Python environment with `movement` installed, it will calculate the sha256 hashes for all files in the `poses`, `videos`, `frames`, and `bboxes` folders and write them to files named `poses_hashes.txt`, `videos_hashes.txt`, `frames_hashes.txt`, and `bboxes_hashes.txt`, respectively.
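
   For reference, the core of such a script fits in a short sketch like the one below (an illustration only, not the actual contents of `get_sha256_hashes.py`; the chunked reading and the output line format are assumptions):

```python
import hashlib
from pathlib import Path


def sha256_of_file(path: Path) -> str:
    """Return the sha256 hex digest of a file, reading it in chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()


# Hash every file in each data folder, one "<hash>  <filename>" line each
for folder in ("poses", "videos", "frames", "bboxes"):
    lines = [
        f"{sha256_of_file(p)}  {p.name}"
        for p in sorted(Path(folder).iterdir())
        if p.is_file()
    ]
    Path(f"{folder}_hashes.txt").write_text("\n".join(lines) + "\n")
```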

7. Add metadata for your new files to `metadata.yaml`, including their sha256 hashes you've calculated. See the example entry below for guidance.
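
   Before committing, it's worth checking that `metadata.yaml` still parses and that your entry carries the required fields. A minimal sanity check, assuming PyYAML is available in your environment:

```python
import yaml

# Parse the file to catch indentation/syntax errors before committing
with open("metadata.yaml") as f:
    metadata = yaml.safe_load(f)

# Replace the key below with your own entry's file name
entry = metadata["SLEAP_three-mice_Aeon_proofread.analysis.h5"]
assert "sha256sum" in entry and "type" in entry
```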

8. Commit a specific file with `gin commit -m <message> <filename>`, or `gin commit -m <message> .` to commit all changes.

9. Upload the committed changes to the GIN repository by running `gin upload`. Latest changes to the repository can be pulled via `gin download`. `gin sync` will synchronise the latest changes bidirectionally.



### `metadata.yaml` example entry
```yaml
"SLEAP_three-mice_Aeon_proofread.analysis.h5":
sha256sum: "82ebd281c406a61536092863bc51d1a5c7c10316275119f7daf01c1ff33eac2a"
source_software: "SLEAP"
type: "poses" # "poses" or "bboxes" depending on the type of tracked data
fps: 50
species: "mouse"
number_of_individuals: 3
53 changes: 31 additions & 22 deletions movement/sample_data.py
@@ -87,7 +87,7 @@ def _fetch_metadata(
-------
dict
A dictionary containing metadata for each sample dataset, with the
dataset name (pose file name) as the key.
dataset file name as the key.

"""
local_file_path = Path(data_dir / file_name)
@@ -116,7 +116,8 @@ def _fetch_metadata(
def _generate_file_registry(metadata: dict[str, dict]) -> dict[str, str]:
"""Generate a file registry based on the contents of the metadata.

This includes files containing poses, frames, or entire videos.
This includes files containing poses, frames, videos, or bounding boxes
data.

Parameters
----------
@@ -131,15 +132,15 @@ def _generate_file_registry(metadata: dict[str, dict]) -> dict[str, str]:
"""
file_registry = {}
for ds, val in metadata.items():
file_registry[f"poses/{ds}"] = val["sha256sum"]
file_registry[f"{val['type']}/{ds}"] = val["sha256sum"]
for key in ["video", "frame"]:
file_name = val[key]["file_name"]
if file_name:
file_registry[f"{key}s/{file_name}"] = val[key]["sha256sum"]
return file_registry
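
As a toy illustration of the `type`-aware registry keys this function now produces (the entry below is made up, shaped like the `metadata.yaml` example in CONTRIBUTING.md):

```python
toy_metadata = {
    "VIA_single-crab.csv": {  # hypothetical bboxes dataset
        "sha256sum": "abc123",
        "type": "bboxes",
        "frame": {"file_name": "single-crab.png", "sha256sum": "def456"},
        "video": {"file_name": "", "sha256sum": ""},
    }
}
registry = _generate_file_registry(toy_metadata)
# {'bboxes/VIA_single-crab.csv': 'abc123',
#  'frames/single-crab.png': 'def456'}
```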


# Create a download manager for the pose data
# Create a download manager for the sample data
metadata = _fetch_metadata(METADATA_FILE, DATA_DIR)
file_registry = _generate_file_registry(metadata)
SAMPLE_DATA = pooch.create(
@@ -151,19 +152,19 @@ def _generate_file_registry(metadata: dict[str, dict]) -> dict[str, str]:


def list_datasets() -> list[str]:
"""Find available sample datasets.
"""List available sample datasets.

Returns
-------
filenames : list of str
List of filenames for available pose data.
List of filenames for available sample datasets.

"""
return list(metadata.keys())


def fetch_dataset_paths(filename: str) -> dict:
"""Get paths to sample pose data and any associated frames or videos.
"""Get paths to sample dataset and any associated frames or videos.

The data are downloaded from the ``movement`` data repository to the user's
local machine upon first use and are stored in a local cache directory.
@@ -172,20 +173,21 @@ def fetch_dataset_paths(filename: str) -> dict:
Parameters
----------
filename : str
Name of the pose file to fetch.
Name of the sample data file to fetch.

Returns
-------
paths : dict
Dictionary mapping file types to their respective paths. The possible
file types are: "poses", "frame", "video". If "frame" or "video" are
not available, the corresponding value is None.
file types are: "poses", "frame", "video", or "bboxes". If "frame" or
"video" is not available, the corresponding value is None.

Examples
--------
>>> from movement.sample_data import fetch_dataset_paths
>>> paths = fetch_dataset_paths("DLC_single-mouse_EPM.predictions.h5")
>>> poses_path = paths["poses"]
>>> poses_path = paths["poses"]  # if the data is of type "poses"
>>> bboxes_path = paths["bboxes"]  # if the data is of type "bboxes"
>>> frame_path = paths["frame"]
>>> video_path = paths["video"]

@@ -194,21 +196,17 @@
fetch_dataset

"""
available_pose_files = list_datasets()
if filename not in available_pose_files:
available_data_files = list_datasets()
if filename not in available_data_files:
raise log_error(
ValueError,
f"File '{filename}' is not in the registry. "
f"Valid filenames are: {available_pose_files}",
f"Valid filenames are: {available_data_files}",
)

frame_file_name = metadata[filename]["frame"]["file_name"]
video_file_name = metadata[filename]["video"]["file_name"]

return {
"poses": Path(
SAMPLE_DATA.fetch(f"poses/{filename}", progressbar=True)
),
paths_dict = {
"frame": None
if not frame_file_name
else Path(
@@ -220,16 +218,23 @@
SAMPLE_DATA.fetch(f"videos/{video_file_name}", progressbar=True)
),
}
# Add trajectory data
# Assume "poses" if not of type "bboxes"
data_type = "bboxes" if metadata[filename]["type"] == "bboxes" else "poses"
paths_dict[data_type] = Path(
SAMPLE_DATA.fetch(f"{data_type}/{filename}", progressbar=True)
)
return paths_dict


def fetch_dataset(
filename: str,
) -> xarray.Dataset:
"""Load a sample dataset containing pose data.
"""Load a sample dataset.

The data are downloaded from the ``movement`` data repository to the user's
local machine upon first use and are stored in a local cache directory.
This function returns the pose data as an xarray Dataset.
This function returns the data as an xarray Dataset.
If there are any associated frames or videos, these files are also
downloaded and the paths are stored as dataset attributes.

@@ -241,7 +246,7 @@
Returns
-------
ds : xarray.Dataset
Pose data contained in the fetched sample file.
Data contained in the fetched sample file.

Examples
--------
@@ -262,6 +267,10 @@
source_software=metadata[filename]["source_software"],
fps=metadata[filename]["fps"],
)

# TODO: Add support for loading bounding boxes data.
# Implemented in PR 229: https://github.com/neuroinformatics-unit/movement/pull/229

ds.attrs["frame_path"] = file_paths["frame"]
ds.attrs["video_path"] = file_paths["video"]

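Taken together, callers can handle both data types with a small fallback, since `fetch_dataset_paths` returns exactly one of the `"poses"`/`"bboxes"` keys. A usage sketch (note the TODO above: `fetch_dataset` itself cannot load bboxes data yet):

```python
from movement.sample_data import fetch_dataset_paths

paths = fetch_dataset_paths("DLC_single-mouse_EPM.predictions.h5")
# Only one of "poses"/"bboxes" is present, depending on the dataset's
# "type" in metadata.yaml; .get() returns None for the absent key.
data_path = paths.get("poses") or paths.get("bboxes")
```
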
15 changes: 7 additions & 8 deletions tests/conftest.py
@@ -20,10 +20,11 @@ def pytest_configure():
"""Perform initial configuration for pytest.
Fetches pose data file paths as a dictionary for tests.
"""
pytest.POSE_DATA_PATHS = {
file_name: fetch_dataset_paths(file_name)["poses"]
for file_name in list_datasets()
}
pytest.DATA_PATHS = {}
for file_name in list_datasets():
paths_dict = fetch_dataset_paths(file_name)
data_path = paths_dict.get("poses") or paths_dict.get("bboxes")
pytest.DATA_PATHS[file_name] = data_path


@pytest.fixture(autouse=True)
@@ -194,9 +195,7 @@ def new_csv_file(tmp_path):
@pytest.fixture
def dlc_style_df():
"""Return a valid DLC-style DataFrame."""
return pd.read_hdf(
pytest.POSE_DATA_PATHS.get("DLC_single-wasp.predictions.h5")
)
return pd.read_hdf(pytest.DATA_PATHS.get("DLC_single-wasp.predictions.h5"))


@pytest.fixture(
@@ -211,7 +210,7 @@ def dlc_style_df():
)
def sleap_file(request):
"""Return the file path for a SLEAP .h5 or .slp file."""
return pytest.POSE_DATA_PATHS.get(request.param)
return pytest.DATA_PATHS.get(request.param)


@pytest.fixture
6 changes: 3 additions & 3 deletions tests/test_integration/test_io.py
@@ -2,7 +2,7 @@
import numpy as np
import pytest
import xarray as xr
from pytest import POSE_DATA_PATHS
from pytest import DATA_PATHS

from movement.io import load_poses, save_poses

@@ -62,7 +62,7 @@ def test_to_sleap_analysis_file_returns_same_h5_file_content(
file) to a SLEAP-style .h5 analysis file returns the same file
contents.
"""
sleap_h5_file_path = POSE_DATA_PATHS.get(sleap_h5_file)
sleap_h5_file_path = DATA_PATHS.get(sleap_h5_file)
ds = load_poses.from_sleap_file(sleap_h5_file_path, fps=fps)
save_poses.to_sleap_analysis_file(ds, new_h5_file)

@@ -93,7 +93,7 @@ def test_to_sleap_analysis_file_source_file(self, file, new_h5_file):
to a SLEAP-style .h5 analysis file stores the .slp labels path
only when the source file is a .slp file.
"""
file_path = POSE_DATA_PATHS.get(file)
file_path = DATA_PATHS.get(file)
if file.startswith("DLC"):
ds = load_poses.from_dlc_file(file_path)
else:
28 changes: 13 additions & 15 deletions tests/test_unit/test_load_poses.py
@@ -4,7 +4,7 @@
import numpy as np
import pytest
import xarray as xr
from pytest import POSE_DATA_PATHS
from pytest import DATA_PATHS
from sleap_io.io.slp import read_labels, write_labels
from sleap_io.model.labels import LabeledFrame, Labels

@@ -18,9 +18,7 @@ class TestLoadPoses:
@pytest.fixture
def sleap_slp_file_without_tracks(self, tmp_path):
"""Mock and return the path to a SLEAP .slp file without tracks."""
sleap_file = POSE_DATA_PATHS.get(
"SLEAP_single-mouse_EPM.predictions.slp"
)
sleap_file = DATA_PATHS.get("SLEAP_single-mouse_EPM.predictions.slp")
labels = read_labels(sleap_file)
file_path = tmp_path / "track_is_none.slp"
lfs = []
@@ -48,7 +46,7 @@ def sleap_slp_file_without_tracks(self, tmp_path):
@pytest.fixture
def sleap_h5_file_without_tracks(self, tmp_path):
"""Mock and return the path to a SLEAP .h5 file without tracks."""
sleap_file = POSE_DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
sleap_file = DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
file_path = tmp_path / "track_is_none.h5"
with h5py.File(sleap_file, "r") as f1, h5py.File(file_path, "w") as f2:
for key in list(f1.keys()):
@@ -120,7 +118,7 @@ def test_load_from_sleap_file_without_tracks(
sleap_file_without_tracks
)
ds_from_tracked = load_poses.from_sleap_file(
POSE_DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
)
# Check if the "individuals" coordinate matches
# the assigned default "individuals_0"
@@ -153,8 +151,8 @@ def test_load_from_sleap_slp_file_or_h5_file_returns_same(
"""Test that loading pose tracks from SLEAP .slp and .h5 files
return the same Dataset.
"""
slp_file_path = POSE_DATA_PATHS.get(slp_file)
h5_file_path = POSE_DATA_PATHS.get(h5_file)
slp_file_path = DATA_PATHS.get(slp_file)
h5_file_path = DATA_PATHS.get(h5_file)
ds_from_slp = load_poses.from_sleap_file(slp_file_path)
ds_from_h5 = load_poses.from_sleap_file(h5_file_path)
xr.testing.assert_allclose(ds_from_h5, ds_from_slp)
@@ -171,7 +169,7 @@ def test_load_from_dlc_file(self, file_name):
"""Test that loading pose tracks from valid DLC files
returns a proper Dataset.
"""
file_path = POSE_DATA_PATHS.get(file_name)
file_path = DATA_PATHS.get(file_name)
ds = load_poses.from_dlc_file(file_path)
self.assert_dataset(ds, file_path, "DeepLabCut")

@@ -191,8 +189,8 @@ def test_load_from_dlc_file_csv_or_h5_file_returns_same(self):
"""Test that loading pose tracks from DLC .csv and .h5 files
return the same Dataset.
"""
csv_file_path = POSE_DATA_PATHS.get("DLC_single-wasp.predictions.csv")
h5_file_path = POSE_DATA_PATHS.get("DLC_single-wasp.predictions.h5")
csv_file_path = DATA_PATHS.get("DLC_single-wasp.predictions.csv")
h5_file_path = DATA_PATHS.get("DLC_single-wasp.predictions.h5")
ds_from_csv = load_poses.from_dlc_file(csv_file_path)
ds_from_h5 = load_poses.from_dlc_file(h5_file_path)
xr.testing.assert_allclose(ds_from_h5, ds_from_csv)
@@ -210,7 +208,7 @@ def test_load_from_dlc_file_csv_or_h5_file_returns_same(self):
def test_fps_and_time_coords(self, fps, expected_fps, expected_time_unit):
"""Test that time coordinates are set according to the provided fps."""
ds = load_poses.from_sleap_file(
POSE_DATA_PATHS.get("SLEAP_three-mice_Aeon_proofread.analysis.h5"),
DATA_PATHS.get("SLEAP_three-mice_Aeon_proofread.analysis.h5"),
fps=fps,
)
assert ds.time_unit == expected_time_unit
@@ -234,7 +232,7 @@ def test_load_from_lp_file(self, file_name):
"""Test that loading pose tracks from valid LightningPose (LP) files
returns a proper Dataset.
"""
file_path = POSE_DATA_PATHS.get(file_name)
file_path = DATA_PATHS.get(file_name)
ds = load_poses.from_lp_file(file_path)
self.assert_dataset(ds, file_path, "LightningPose")

Expand All @@ -243,7 +241,7 @@ def test_load_from_lp_or_dlc_file_returns_same(self):
using either the `from_lp_file` or `from_dlc_file` function
returns the same Dataset (except for the source_software).
"""
file_path = POSE_DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv")
file_path = DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv")
ds_from_lp = load_poses.from_lp_file(file_path)
ds_from_dlc = load_poses.from_dlc_file(file_path)
xr.testing.assert_allclose(ds_from_dlc, ds_from_lp)
@@ -254,7 +252,7 @@ def test_load_multi_individual_from_lp_file_raises(self):
"""Test that loading a multi-individual .csv file using the
`from_lp_file` function raises a ValueError.
"""
file_path = POSE_DATA_PATHS.get("DLC_two-mice.predictions.csv")
file_path = DATA_PATHS.get("DLC_two-mice.predictions.csv")
with pytest.raises(ValueError):
load_poses.from_lp_file(file_path)

5 changes: 3 additions & 2 deletions tests/test_unit/test_sample_data.py
@@ -38,6 +38,7 @@ def validate_metadata(metadata: dict[str, dict]) -> None:
"""Assert that the metadata is in the expected format."""
metadata_fields = [
"sha256sum",
"type",
"source_software",
"fps",
"species",
@@ -59,9 +60,9 @@ def validate_metadata(metadata: dict[str, dict]) -> None:
), f"Expected metadata values to be dicts. {check_yaml_msg}"
assert all(
set(val.keys()) == set(metadata_fields) for val in metadata.values()
), f"Found issues with the names of medatada fields. {check_yaml_msg}"
), f"Found issues with the names of metadata fields. {check_yaml_msg}"

# check that metadata keys (pose file names) are unique
# check that metadata keys (file names) are unique
assert len(metadata.keys()) == len(set(metadata.keys()))

# check that the first 2 fields are present and are strings