Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Example Dataset - Spatial Datasets #832

Merged
merged 9 commits into from
Nov 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,4 @@ Pipfile.lock
.coverage.*
.DS_Store

data/example_dataset/image_data/
data/example_dataset/segmentation/
data/example_dataset/pixie/
data/example_dataset/post_clustering/
data/*
9 changes: 5 additions & 4 deletions ark/analysis/spatial_analysis.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from itertools import combinations_with_replacement
import os
from itertools import combinations_with_replacement

import numpy as np
import pandas as pd
Expand All @@ -20,7 +20,7 @@ def generate_channel_spatial_enrichment_stats(label_dir, dist_mat_dir, marker_th
directory containing labeled tiffs
dist_mat_dir (str | Pathlike):
directory containing the distance matrices
marker_thresholds (numpy.ndarray):
marker_thresholds (pd.DataFrame):
threshold values for positive marker expression
all_data (pandas.DataFrame):
data including fovs, cell labels, and cell expression matrix for all markers
Expand Down Expand Up @@ -107,7 +107,7 @@ def calculate_channel_spatial_enrichment(fov, dist_matrix, marker_thresholds, al
dist_matrix (xarray.DataArray):
a cells x cells matrix with the Euclidean distance between centers of
corresponding cells for the FOV
marker_thresholds (numpy.ndarray):
marker_thresholds (pd.DataFrame):
threshold values for positive marker expression
all_data (pandas.DataFrame):
data including fovs, cell labels, and cell expression matrix for all markers
Expand Down Expand Up @@ -149,8 +149,9 @@ def calculate_channel_spatial_enrichment(fov, dist_matrix, marker_thresholds, al
all_channel_data = all_data.iloc[:, channel_start:channel_end]
if excluded_channels is not None:
all_channel_data = all_channel_data.drop(excluded_channels, axis=1)
marker_thresholds = marker_thresholds[~marker_thresholds["marker"].isin(excluded_channels)]

# check that the markers are the same in marker_thresholdsa and all_channel_data
# check that the markers are the same in marker_thresholds and all_channel_data
misc_utils.verify_same_elements(markers_to_threshold=marker_thresholds.iloc[:, 0].values,
all_markers=all_channel_data.columns.values)

Expand Down
2 changes: 1 addition & 1 deletion ark/analysis/spatial_analysis_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def test_calculate_channel_spatial_enrichment():

with pytest.raises(ValueError):
# attempt to include marker thresholds and marker columns that do not exist
bad_marker_thresholds = pd.DataFrame(np.zeros((21, 2)))
bad_marker_thresholds = pd.DataFrame(np.zeros((21, 2)), columns=["marker", "threshold"])
bad_marker_thresholds.iloc[:, 1] = .5
bad_marker_thresholds.iloc[:, 0] = np.arange(10, 31) + 2

Expand Down
19 changes: 17 additions & 2 deletions ark/utils/example_dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pathlib
import shutil
from typing import Union
import warnings
from typing import Union

import datasets

Expand All @@ -20,6 +20,11 @@ def __init__(self, dataset: str, overwrite_existing: bool = True, cache_dir: str
* `"cluster_pixels"`
* `"cluster_cells"`
* `"post_clustering"`
* `"fiber_segmentation"`
* `"LDA_preprocessing"`
* `"LDA_training_inference"`
* `"neighborhood_analysis"`
* `"pairwise_spatial_enrichment"`
overwrite_existing (bool): A flag to overwrite existing data. Defaults to `True`.
cache_dir (str, optional): The directory to save the cache dir. Defaults to `None`,
which internally in Hugging Face defaults to `~/.cache/huggingface/datasets`.
Expand All @@ -40,6 +45,8 @@ def __init__(self, dataset: str, overwrite_existing: bool = True, cache_dir: str
"deepcell_output": "segmentation/deepcell_output",
"example_pixel_output_dir": "pixie/example_pixel_output_dir",
"example_cell_output_dir": "pixie/example_cell_output_dir",
"spatial_lda": "spatial_analysis/spatial_lda",
"post_clustering": "post_clustering",
}
"""
Path suffixes for mapping each downloaded dataset partition to its appropriate
Expand Down Expand Up @@ -145,7 +152,15 @@ def get_example_dataset(dataset: str, save_dir: Union[str, pathlib.Path],
downloaded. Defaults to True.
"""

valid_datasets = ["segment_image_data", "cluster_pixels", "cluster_cells", "post_clustering"]
valid_datasets = ["segment_image_data",
"cluster_pixels",
"cluster_cells",
"post_clustering",
"fiber_segmentation",
"LDA_preprocessing",
"LDA_training_inference",
"neighborhood_analysis",
"pairwise_spatial_enrichment"]

# Check the appropriate dataset name
if dataset not in valid_datasets:
Expand Down
56 changes: 52 additions & 4 deletions ark/utils/example_dataset_test.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
import pathlib
from typing import Callable, Iterator, Generator
from typing import Callable, Generator, Iterator

import pytest
from ark.utils.example_dataset import ExampleDataset, get_example_dataset

from ark.utils import test_utils
from ark.utils.example_dataset import ExampleDataset, get_example_dataset


@pytest.fixture(scope="session", params=["segment_image_data", "cluster_pixels",
"cluster_cells", "post_clustering"])
@pytest.fixture(scope="session", params=["segment_image_data",
"cluster_pixels",
"cluster_cells",
"post_clustering",
"fiber_segmentation",
"LDA_preprocessing",
srivarra marked this conversation as resolved.
Show resolved Hide resolved
"LDA_training_inference",
"neighborhood_analysis",
"pairwise_spatial_enrichment"])
def dataset_download(request) -> Iterator[ExampleDataset]:
"""
A Fixture which instantiates and downloads the dataset with respect to each
Expand Down Expand Up @@ -71,21 +80,36 @@ def _setup(self):
"cell_masks": [f"fov{i}_cell_mask" for i in range(2)]
}

self._spatial_analysis_lda_preprocessed_files = [
"difference_mats",
"featurized_cell_table",
"formatted_cell_table",
"fov_stats",
"topic_eda"]

self._post_clustering_files = ["cell_table_thresholded",
"marker_thresholds", "updated_cell_table"]

self.dataset_test_fns: dict[str, Callable] = {
"image_data": self._image_data_check,
"cell_table": self._cell_table_check,
"deepcell_output": self._deepcell_output_check,
"example_pixel_output_dir": self._example_pixel_output_dir_check,
"example_cell_output_dir": self._example_cell_output_dir_check,
"spatial_lda": self._spatial_lda_output_dir_check,
"post_clustering": self._post_clustering_output_dir_check
}

# Mapping the datasets to their respective test functions.
# Should be the same as `example_dataset.ExampleDataset.path_suffixes`
self.move_path_suffixes = {
"image_data": "image_data",
"cell_table": "segmentation/cell_table",
"deepcell_output": "segmentation/deepcell_output",
"example_pixel_output_dir": "pixie/example_pixel_output_dir",
"example_cell_output_dir": "pixie/example_cell_output_dir",
"spatial_lda": "spatial_analysis/spatial_lda",
"post_clustering": "post_clustering",
}

def test_download_example_dataset(self, dataset_download: ExampleDataset):
Expand Down Expand Up @@ -347,6 +371,30 @@ def _example_cell_output_dir_check(self, dir_p: pathlib.Path):
assert set(self._example_cell_output_dir_names["cell_masks"]) \
== set(cell_mask_names)

def _spatial_lda_output_dir_check(self, dir_p: pathlib.Path):
"""
Checks to make sure that the correct files exist w.r.t the `spatial_lda` output dir
`spatial_analysis/spatial_lda/preprocessed`.

Args:
dir_p (pathlib.Path): The directory to check.
"""
downloaded_lda_preprocessed = list((dir_p / "preprocessed").glob("*.pkl"))
downloaded_lda_preprocessed_names = [f.stem for f in downloaded_lda_preprocessed]
assert set(self._spatial_analysis_lda_preprocessed_files) == set(
downloaded_lda_preprocessed_names)

def _post_clustering_output_dir_check(self, dir_p: pathlib.Path):
"""
Checks to make sure that the correct files exist w.r.t the `post_clustering` output dir

Args:
dir_p (pathlib.Path): The directory to check.
"""
downloaded_post_cluster = list(dir_p.glob("*.csv"))
downloaded_post_cluster_names = [f.stem for f in downloaded_post_cluster]
assert set(self._post_clustering_files) == set(downloaded_post_cluster_names)

def _suffix_paths(self, dataset_download: ExampleDataset,
parent_dir: pathlib.Path) -> Generator:
"""
Expand Down
1 change: 1 addition & 0 deletions ark/utils/io_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

def test_validate_paths():
# change cwd to templates/ for more accurate testing
pathlib.Path("data").mkdir(parents=True, exist_ok=True)
os.chdir('templates')

# make a tempdir for testing
Expand Down
6 changes: 3 additions & 3 deletions ark/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import xarray as xr

import ark.settings as settings
from ark.utils import synthetic_spatial_datagen, io_utils
from ark.utils import io_utils, synthetic_spatial_datagen
from ark.utils.tiff_utils import write_mibitiff


Expand Down Expand Up @@ -713,7 +713,7 @@ def _make_threshold_mat(in_utils):
a sample marker threshold matrix for thresholding specifically for channel enrichment
"""

thresh = pd.DataFrame(np.zeros((20, 2)))
thresh = pd.DataFrame(np.zeros((20, 2)), columns=["marker", "threshold"])
thresh.iloc[:, 1] = .5

if not in_utils:
Expand Down Expand Up @@ -1127,7 +1127,7 @@ def generate_sample_fov_tiling_entry(coord, name):
"aperture": "2",
"displayName": "Fine",
"defaults": {
"timingChoice": 7
"timingChoice": 7
}
},
"sectionId": 8201,
Expand Down
Binary file not shown.
Binary file not shown.
Loading