Add bboxes sample data #231

Merged · 12 commits · Jun 28, 2024
31 changes: 29 additions & 2 deletions CONTRIBUTING.md
@@ -322,17 +322,44 @@ To add a new file, you will need to:
2. Ask to be added as a collaborator on the [movement data repository](https://gin.g-node.org/neuroinformatics/movement-test-data) (if not already)
3. Download the [GIN CLI](https://gin.g-node.org/G-Node/Info/wiki/GIN+CLI+Setup#quickstart) and set it up with your GIN credentials, by running `gin login` in a terminal.
4. Clone the movement data repository to your local machine, by running `gin get neuroinformatics/movement-test-data` in a terminal.
5. Add your new files to the `poses`, `videos`, and/or `frames` folders as appropriate. Follow the existing file naming conventions as closely as possible.
6. Determine the sha256 checksum hash of each new file by running `sha256sum <filename>` in a terminal. For convenience, we've included a `get_sha256_hashes.py` script in the [movement data repository](https://gin.g-node.org/neuroinformatics/movement-test-data). If you run this from the root of the data repository, within a Python environment with `movement` installed, it will calculate the sha256 hashes for all files in the `poses`, `videos`, and `frames` folders and write them to files named `poses_hashes.txt`, `videos_hashes.txt`, and `frames_hashes.txt`, respectively.
5. Add your new files to the `poses`, `videos`, `frames`, and/or `bboxes` folders as appropriate. Follow the existing file naming conventions as closely as possible.
6. Determine the sha256 checksum hash of each new file. You can do this in a terminal by running:
::::{tab-set}

:::{tab-item} Ubuntu
```bash
sha256sum <filename>
```
:::

:::{tab-item} macOS
```bash
shasum -a 256 <filename>
```
:::

:::{tab-item} Windows
```powershell
certutil -hashfile <filename> SHA256
```
:::
::::
For convenience, we've included a `get_sha256_hashes.py` script in the [movement data repository](https://gin.g-node.org/neuroinformatics/movement-test-data). If you run this from the root of the data repository, within a Python environment with `movement` installed, it will calculate the sha256 hashes for all files in the `poses`, `videos`, `frames`, and `bboxes` folders and write them to files named `poses_hashes.txt`, `videos_hashes.txt`, `frames_hashes.txt`, and `bboxes_hashes.txt`, respectively.
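
   For reference, the core of such a script fits in a short sketch like the one below (an illustration only, not the actual contents of `get_sha256_hashes.py`; the chunked reading and the output line format are assumptions):

```python
import hashlib
from pathlib import Path


def sha256_of_file(path: Path) -> str:
    """Return the sha256 hex digest of a file, reading it in chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()


# Hash every file in each data folder, one "<hash>  <filename>" line each
for folder in ("poses", "videos", "frames", "bboxes"):
    lines = [
        f"{sha256_of_file(p)}  {p.name}"
        for p in sorted(Path(folder).iterdir())
        if p.is_file()
    ]
    Path(f"{folder}_hashes.txt").write_text("\n".join(lines) + "\n")
```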

7. Add metadata for your new files to `metadata.yaml`, including their sha256 hashes you've calculated. See the example entry below for guidance.
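
   Before committing, it's worth checking that `metadata.yaml` still parses and that your entry carries the required fields. A minimal sanity check, assuming PyYAML is available in your environment:

```python
import yaml

# Parse the file to catch indentation/syntax errors before committing
with open("metadata.yaml") as f:
    metadata = yaml.safe_load(f)

# Replace the key below with your own entry's file name
entry = metadata["SLEAP_three-mice_Aeon_proofread.analysis.h5"]
assert "sha256sum" in entry and "type" in entry
```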

8. Commit a specific file with `gin commit -m <message> <filename>`, or `gin commit -m <message> .` to commit all changes.

9. Upload the committed changes to the GIN repository by running `gin upload`. Latest changes to the repository can be pulled via `gin download`. `gin sync` will synchronise the latest changes bidirectionally.



### `metadata.yaml` example entry
```yaml
"SLEAP_three-mice_Aeon_proofread.analysis.h5":
sha256sum: "82ebd281c406a61536092863bc51d1a5c7c10316275119f7daf01c1ff33eac2a"
source_software: "SLEAP"
type: "poses" # "poses" or "bboxes" depending on the type of tracked data
fps: 50
species: "mouse"
number_of_individuals: 3
53 changes: 31 additions & 22 deletions movement/sample_data.py
@@ -87,7 +87,7 @@ def _fetch_metadata(
-------
dict
A dictionary containing metadata for each sample dataset, with the
dataset name (pose file name) as the key.
dataset file name as the key.

"""
local_file_path = Path(data_dir / file_name)
@@ -116,7 +116,8 @@ def _fetch_metadata(
def _generate_file_registry(metadata: dict[str, dict]) -> dict[str, str]:
"""Generate a file registry based on the contents of the metadata.

This includes files containing poses, frames, or entire videos.
This includes files containing poses, frames, videos, or bounding boxes
data.

Parameters
----------
@@ -131,15 +132,15 @@ def _generate_file_registry(metadata: dict[str, dict]) -> dict[str, str]:
"""
file_registry = {}
for ds, val in metadata.items():
file_registry[f"poses/{ds}"] = val["sha256sum"]
file_registry[f"{val['type']}/{ds}"] = val["sha256sum"]
for key in ["video", "frame"]:
file_name = val[key]["file_name"]
if file_name:
file_registry[f"{key}s/{file_name}"] = val[key]["sha256sum"]
return file_registry
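
As a toy illustration of the `type`-aware registry keys this function now produces (the entry below is made up, shaped like the `metadata.yaml` example in CONTRIBUTING.md):

```python
toy_metadata = {
    "VIA_single-crab.csv": {  # hypothetical bboxes dataset
        "sha256sum": "abc123",
        "type": "bboxes",
        "frame": {"file_name": "single-crab.png", "sha256sum": "def456"},
        "video": {"file_name": "", "sha256sum": ""},
    }
}
registry = _generate_file_registry(toy_metadata)
# {'bboxes/VIA_single-crab.csv': 'abc123',
#  'frames/single-crab.png': 'def456'}
```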


# Create a download manager for the pose data
# Create a download manager for the sample data
metadata = _fetch_metadata(METADATA_FILE, DATA_DIR)
file_registry = _generate_file_registry(metadata)
SAMPLE_DATA = pooch.create(
@@ -151,19 +152,19 @@ def _generate_file_registry(metadata: dict[str, dict]) -> dict[str, str]:


def list_datasets() -> list[str]:
"""Find available sample datasets.
"""List available sample datasets.

Returns
-------
filenames : list of str
List of filenames for available pose data.
List of filenames for available sample datasets.

"""
return list(metadata.keys())


def fetch_dataset_paths(filename: str) -> dict:
"""Get paths to sample pose data and any associated frames or videos.
"""Get paths to sample dataset and any associated frames or videos.

The data are downloaded from the ``movement`` data repository to the user's
local machine upon first use and are stored in a local cache directory.
@@ -172,20 +173,21 @@ def fetch_dataset_paths(filename: str) -> dict:
Parameters
----------
filename : str
Name of the pose file to fetch.
Name of the sample data file to fetch.

Returns
-------
paths : dict
Dictionary mapping file types to their respective paths. The possible
file types are: "poses", "frame", "video". If "frame" or "video" are
not available, the corresponding value is None.
file types are: "poses", "frame", "video", or "bboxes". If "frame" or
"video" is not available, the corresponding value is None.

Examples
--------
>>> from movement.sample_data import fetch_dataset_paths
>>> paths = fetch_dataset_paths("DLC_single-mouse_EPM.predictions.h5")
>>> poses_path = paths["poses"]
>>> poses_path = paths["poses"]  # if the data is of type "poses"
>>> bboxes_path = paths["bboxes"]  # if the data is of type "bboxes"
>>> frame_path = paths["frame"]
>>> video_path = paths["video"]

@@ -194,21 +196,17 @@
fetch_dataset

"""
available_pose_files = list_datasets()
if filename not in available_pose_files:
available_data_files = list_datasets()
if filename not in available_data_files:
raise log_error(
ValueError,
f"File '{filename}' is not in the registry. "
f"Valid filenames are: {available_pose_files}",
f"Valid filenames are: {available_data_files}",
)

frame_file_name = metadata[filename]["frame"]["file_name"]
video_file_name = metadata[filename]["video"]["file_name"]

return {
"poses": Path(
SAMPLE_DATA.fetch(f"poses/{filename}", progressbar=True)
),
paths_dict = {
"frame": None
if not frame_file_name
else Path(
@@ -220,16 +218,23 @@
SAMPLE_DATA.fetch(f"videos/{video_file_name}", progressbar=True)
),
}
# Add trajectory data
# Assume "poses" if not of type "bboxes"
data_type = "bboxes" if metadata[filename]["type"] == "bboxes" else "poses"
paths_dict[data_type] = Path(
SAMPLE_DATA.fetch(f"{data_type}/{filename}", progressbar=True)
)
return paths_dict


def fetch_dataset(
filename: str,
) -> xarray.Dataset:
"""Load a sample dataset containing pose data.
"""Load a sample dataset.

The data are downloaded from the ``movement`` data repository to the user's
local machine upon first use and are stored in a local cache directory.
This function returns the pose data as an xarray Dataset.
This function returns the data as an xarray Dataset.
If there are any associated frames or videos, these files are also
downloaded and the paths are stored as dataset attributes.

@@ -241,7 +246,7 @@
Returns
-------
ds : xarray.Dataset
Pose data contained in the fetched sample file.
Data contained in the fetched sample file.

Examples
--------
@@ -262,6 +267,10 @@
source_software=metadata[filename]["source_software"],
fps=metadata[filename]["fps"],
)

# TODO: Add support for loading bounding boxes data.
# Implemented in PR 229: https://github.com/neuroinformatics-unit/movement/pull/229

ds.attrs["frame_path"] = file_paths["frame"]
ds.attrs["video_path"] = file_paths["video"]

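Taken together, callers can handle both data types with a small fallback, since `fetch_dataset_paths` returns exactly one of the `"poses"`/`"bboxes"` keys. A usage sketch (note the TODO above: `fetch_dataset` itself cannot load bboxes data yet):

```python
from movement.sample_data import fetch_dataset_paths

paths = fetch_dataset_paths("DLC_single-mouse_EPM.predictions.h5")
# Only one of "poses"/"bboxes" is present, depending on the dataset's
# "type" in metadata.yaml; .get() returns None for the absent key.
data_path = paths.get("poses") or paths.get("bboxes")
```
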
15 changes: 7 additions & 8 deletions tests/conftest.py
@@ -20,10 +20,11 @@ def pytest_configure():
"""Perform initial configuration for pytest.
Fetches pose data file paths as a dictionary for tests.
"""
pytest.POSE_DATA_PATHS = {
file_name: fetch_dataset_paths(file_name)["poses"]
for file_name in list_datasets()
}
pytest.DATA_PATHS = {}
for file_name in list_datasets():
paths_dict = fetch_dataset_paths(file_name)
data_path = paths_dict.get("poses") or paths_dict.get("bboxes")
pytest.DATA_PATHS[file_name] = data_path


@pytest.fixture(autouse=True)
@@ -194,9 +195,7 @@ def new_csv_file(tmp_path):
@pytest.fixture
def dlc_style_df():
"""Return a valid DLC-style DataFrame."""
return pd.read_hdf(
pytest.POSE_DATA_PATHS.get("DLC_single-wasp.predictions.h5")
)
return pd.read_hdf(pytest.DATA_PATHS.get("DLC_single-wasp.predictions.h5"))


@pytest.fixture(
@@ -211,7 +210,7 @@ def dlc_style_df():
)
def sleap_file(request):
"""Return the file path for a SLEAP .h5 or .slp file."""
return pytest.POSE_DATA_PATHS.get(request.param)
return pytest.DATA_PATHS.get(request.param)


@pytest.fixture
6 changes: 3 additions & 3 deletions tests/test_integration/test_io.py
@@ -2,7 +2,7 @@
import numpy as np
import pytest
import xarray as xr
from pytest import POSE_DATA_PATHS
from pytest import DATA_PATHS

from movement.io import load_poses, save_poses

@@ -62,7 +62,7 @@ def test_to_sleap_analysis_file_returns_same_h5_file_content(
file) to a SLEAP-style .h5 analysis file returns the same file
contents.
"""
sleap_h5_file_path = POSE_DATA_PATHS.get(sleap_h5_file)
sleap_h5_file_path = DATA_PATHS.get(sleap_h5_file)
ds = load_poses.from_sleap_file(sleap_h5_file_path, fps=fps)
save_poses.to_sleap_analysis_file(ds, new_h5_file)

@@ -93,7 +93,7 @@ def test_to_sleap_analysis_file_source_file(self, file, new_h5_file):
to a SLEAP-style .h5 analysis file stores the .slp labels path
only when the source file is a .slp file.
"""
file_path = POSE_DATA_PATHS.get(file)
file_path = DATA_PATHS.get(file)
if file.startswith("DLC"):
ds = load_poses.from_dlc_file(file_path)
else:
28 changes: 13 additions & 15 deletions tests/test_unit/test_load_poses.py
@@ -4,7 +4,7 @@
import numpy as np
import pytest
import xarray as xr
from pytest import POSE_DATA_PATHS
from pytest import DATA_PATHS
from sleap_io.io.slp import read_labels, write_labels
from sleap_io.model.labels import LabeledFrame, Labels

@@ -18,9 +18,7 @@ class TestLoadPoses:
@pytest.fixture
def sleap_slp_file_without_tracks(self, tmp_path):
"""Mock and return the path to a SLEAP .slp file without tracks."""
sleap_file = POSE_DATA_PATHS.get(
"SLEAP_single-mouse_EPM.predictions.slp"
)
sleap_file = DATA_PATHS.get("SLEAP_single-mouse_EPM.predictions.slp")
labels = read_labels(sleap_file)
file_path = tmp_path / "track_is_none.slp"
lfs = []
@@ -48,7 +46,7 @@ def sleap_slp_file_without_tracks(self, tmp_path):
@pytest.fixture
def sleap_h5_file_without_tracks(self, tmp_path):
"""Mock and return the path to a SLEAP .h5 file without tracks."""
sleap_file = POSE_DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
sleap_file = DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
file_path = tmp_path / "track_is_none.h5"
with h5py.File(sleap_file, "r") as f1, h5py.File(file_path, "w") as f2:
for key in list(f1.keys()):
@@ -120,7 +118,7 @@ def test_load_from_sleap_file_without_tracks(
sleap_file_without_tracks
)
ds_from_tracked = load_poses.from_sleap_file(
POSE_DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
)
# Check if the "individuals" coordinate matches
# the assigned default "individuals_0"
@@ -153,8 +151,8 @@ def test_load_from_sleap_slp_file_or_h5_file_returns_same(
"""Test that loading pose tracks from SLEAP .slp and .h5 files
return the same Dataset.
"""
slp_file_path = POSE_DATA_PATHS.get(slp_file)
h5_file_path = POSE_DATA_PATHS.get(h5_file)
slp_file_path = DATA_PATHS.get(slp_file)
h5_file_path = DATA_PATHS.get(h5_file)
ds_from_slp = load_poses.from_sleap_file(slp_file_path)
ds_from_h5 = load_poses.from_sleap_file(h5_file_path)
xr.testing.assert_allclose(ds_from_h5, ds_from_slp)
@@ -171,7 +169,7 @@ def test_load_from_dlc_file(self, file_name):
"""Test that loading pose tracks from valid DLC files
returns a proper Dataset.
"""
file_path = POSE_DATA_PATHS.get(file_name)
file_path = DATA_PATHS.get(file_name)
ds = load_poses.from_dlc_file(file_path)
self.assert_dataset(ds, file_path, "DeepLabCut")

@@ -191,8 +189,8 @@ def test_load_from_dlc_file_csv_or_h5_file_returns_same(self):
"""Test that loading pose tracks from DLC .csv and .h5 files
return the same Dataset.
"""
csv_file_path = POSE_DATA_PATHS.get("DLC_single-wasp.predictions.csv")
h5_file_path = POSE_DATA_PATHS.get("DLC_single-wasp.predictions.h5")
csv_file_path = DATA_PATHS.get("DLC_single-wasp.predictions.csv")
h5_file_path = DATA_PATHS.get("DLC_single-wasp.predictions.h5")
ds_from_csv = load_poses.from_dlc_file(csv_file_path)
ds_from_h5 = load_poses.from_dlc_file(h5_file_path)
xr.testing.assert_allclose(ds_from_h5, ds_from_csv)
@@ -210,7 +208,7 @@ def test_load_from_dlc_file_csv_or_h5_file_returns_same(self):
def test_fps_and_time_coords(self, fps, expected_fps, expected_time_unit):
"""Test that time coordinates are set according to the provided fps."""
ds = load_poses.from_sleap_file(
POSE_DATA_PATHS.get("SLEAP_three-mice_Aeon_proofread.analysis.h5"),
DATA_PATHS.get("SLEAP_three-mice_Aeon_proofread.analysis.h5"),
fps=fps,
)
assert ds.time_unit == expected_time_unit
@@ -234,7 +232,7 @@ def test_load_from_lp_file(self, file_name):
"""Test that loading pose tracks from valid LightningPose (LP) files
returns a proper Dataset.
"""
file_path = POSE_DATA_PATHS.get(file_name)
file_path = DATA_PATHS.get(file_name)
ds = load_poses.from_lp_file(file_path)
self.assert_dataset(ds, file_path, "LightningPose")

Expand All @@ -243,7 +241,7 @@ def test_load_from_lp_or_dlc_file_returns_same(self):
using either the `from_lp_file` or `from_dlc_file` function
returns the same Dataset (except for the source_software).
"""
file_path = POSE_DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv")
file_path = DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv")
ds_from_lp = load_poses.from_lp_file(file_path)
ds_from_dlc = load_poses.from_dlc_file(file_path)
xr.testing.assert_allclose(ds_from_dlc, ds_from_lp)
@@ -254,7 +252,7 @@ def test_load_multi_individual_from_lp_file_raises(self):
"""Test that loading a multi-individual .csv file using the
`from_lp_file` function raises a ValueError.
"""
file_path = POSE_DATA_PATHS.get("DLC_two-mice.predictions.csv")
file_path = DATA_PATHS.get("DLC_two-mice.predictions.csv")
with pytest.raises(ValueError):
load_poses.from_lp_file(file_path)

5 changes: 3 additions & 2 deletions tests/test_unit/test_sample_data.py
@@ -38,6 +38,7 @@ def validate_metadata(metadata: dict[str, dict]) -> None:
"""Assert that the metadata is in the expected format."""
metadata_fields = [
"sha256sum",
"type",
"source_software",
"fps",
"species",
@@ -59,9 +60,9 @@ def validate_metadata(metadata: dict[str, dict]) -> None:
), f"Expected metadata values to be dicts. {check_yaml_msg}"
assert all(
set(val.keys()) == set(metadata_fields) for val in metadata.values()
), f"Found issues with the names of medatada fields. {check_yaml_msg}"
), f"Found issues with the names of metadata fields. {check_yaml_msg}"

# check that metadata keys (pose file names) are unique
# check that metadata keys (file names) are unique
assert len(metadata.keys()) == len(set(metadata.keys()))

# check that the first 2 fields are present and are strings