diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index e5727eb7..623d757b 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -16,7 +16,8 @@ jobs: with: fetch-depth: 0 # otherwise, you will failed to push refs to dest repo - name: install dacapo - run: pip install .[docs] + # run: pip install .[docs] + run: pip install sphinx-autodoc-typehints sphinx-autoapi sphinx-click sphinx-rtd-theme myst-parser - name: Build and Commit uses: sphinx-notes/pages@v2 with: diff --git a/.gitignore b/.gitignore index d14f3fb8..2cc4fcde 100644 --- a/.gitignore +++ b/.gitignore @@ -29,4 +29,7 @@ tmp/ # daisy logs daisy_logs/ -*.csv \ No newline at end of file +*.csv +*.private + +user_experiments/* diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..72bad0bd --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,62 @@ + +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." +authors: +- family-names: "Patton" + given-names: "William" + orcid: "https://orcid.org/0000-0002-9652-3222" +- family-names: "Rhoades" + given-names: "Jeff L." + orcid: "https://orcid.org/0000-0001-5077-2533" +- family-names: "Zouinkhi" + given-names: "Marwan" + orcid: "https://orcid.org/0000-0002-9441-2908" +- family-names: "Funke" + given-names: "Jan" + orcid: "http://orcid.org/0000-0003-4388-7783" +title: "DaCapo" +version: 0.3.0 +doi: 10.48550/arXiv.2408.02834 +date-released: 2024-08-05 +url: "https://github.com/janelia-cellmap/dacapo" +preferred-citation: + type: article + authors: + - family-names: "Patton" + given-names: "William" + orcid: "https://orcid.org/0000-0002-9652-3222" + - family-names: "Rhoades" + given-names: "Jeff L." + orcid: "https://orcid.org/0000-0001-5077-2533" + - family-names: "Zouinkhi" + given-names: "Marwan" + orcid: "https://orcid.org/0000-0002-9441-2908" + - family-names: "Ackerman" + given-names: "David G." + orcid: "http://orcid.org/0000-0003-0172-6594" + - family-names: "Malin-Mayor" + given-names: "Caroline" + orcid: "https://orcid.org/0000-0002-9627-6030" + - family-names: "Adjavon" + given-names: "Diane" + - family-names: "Heinrich" + given-names: "Larissa" + orcid: "http://orcid.org/0000-0003-2852-6664" + - family-names: "Bennett" + given-names: "Davis" + orcid: "http://orcid.org/0000-0001-7579-2848" + - family-names: "Zubov" + given-names: "Yurii" + orcid: "https://orcid.org/0000-0003-1988-8081" + - family-names: "Project Team" + given-names: "CellMap" + - family-names: "Weigel" + given-names: "Aubrey V." + orcid: "http://orcid.org/0000-0003-1694-4420" + - family-names: "Funke" + given-names: "Jan" + orcid: "http://orcid.org/0000-0003-4388-7783" + doi: 10.48550/arXiv.2408.02834 + journal: "arXiv-cs.CV" + title: "DaCapo: a modular deep learning framework for scalable 3D image segmentation" + year: 2024 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 892c7333..48a13b95 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,10 +22,10 @@ This will also be run automatically when a PR is made to master and a codecov re ## Branching and PRs -- Users that have been added to the CellMap organization and the DaCapo project should be able to develop directly into the CellMap fork of DaCapo. Other users will need to create a fork. -- For a completely new feature, make a branch off of the `dev/main` branch of CellMap's fork of DaCapo with a name describing the feature. If you are collaborating on a feature that already has a branch, you can branch off that feature branch. 
-- Currently, you should make your PRs into the `dev/main` branch of CellMap's fork, or the feature branch you branched off of. PRs currently require one maintainer's approval before merging. Once the PR is merged, the feature branch should be deleted. -- `dev/main` will be regularly merged to `main` when new features are fully implemented and all tests are passing. +- Users that have been added to the CellMap organization and the DaCapo project should be able to develop directly into the CellMap fork of DaCapo. Other users will need to create a fork or ask to be added as a collaborator. +- For a completely new feature, make a branch off of the `main` branch of CellMap's fork of DaCapo with a name describing the feature. If you are collaborating on a feature that already has a branch, you can branch off that feature branch. +- Currently, you should make your PRs into the `main` branch of CellMap's fork, or the feature branch you branched off of. PRs currently require one maintainer's approval before merging. Once the PR is merged, the feature branch will/should be deleted. +- `main` will be regularly published to PyPi when new features are fully implemented and all tests are passing. ## Documentation @@ -33,4 +33,4 @@ Documentation is built using Sphinx. To build the documentation locally, run ```bash sphinx-build -M html docs/source docs/build ``` -This will generate the html files in the `docs/build/html` directory. \ No newline at end of file +This will generate the html files in the `docs/build/html` directory. diff --git a/README.md b/README.md index 18de652a..dd37152a 100644 --- a/README.md +++ b/README.md @@ -72,4 +72,15 @@ Tasks we support and approaches for those tasks: - Example of [groundtruth data](https://tinyurl.com/pu8mespz) - Visualization - [Neuroglancer GitHub Repo](https://github.com/google/neuroglancer) - + +# Citing this repo +If you use our code, please cite us and spread the news! +``` +@article{Patton_DaCapo_a_modular_2024, +author = {Patton, William and Rhoades, Jeff L. and Zouinkhi, Marwan and Ackerman, David G. and Malin-Mayor, Caroline and Adjavon, Diane and Heinrich, Larissa and Bennett, Davis and Zubov, Yurii and Project Team, CellMap and Weigel, Aubrey V. and Funke, Jan}, +doi = {10.48550/arXiv.2408.02834}, +journal = {arXiv-cs.CV}, +title = {{DaCapo: a modular deep learning framework for scalable 3D image segmentation}}, +year = {2024} +} +``` diff --git a/dacapo/__init__.py b/dacapo/__init__.py index fcca5ce7..f54a1e06 100644 --- a/dacapo/__init__.py +++ b/dacapo/__init__.py @@ -5,6 +5,6 @@ from . import experiments, utils # noqa from .apply import apply # noqa from .train import train # noqa -from .validate import validate # noqa +from .validate import validate, validate_run # noqa from .predict import predict # noqa from .blockwise import run_blockwise, segment_blockwise # noqa diff --git a/dacapo/blockwise/__init__.py b/dacapo/blockwise/__init__.py index 6027a911..aa198e0d 100644 --- a/dacapo/blockwise/__init__.py +++ b/dacapo/blockwise/__init__.py @@ -1,2 +1,3 @@ from .blockwise_task import DaCapoBlockwiseTask from .scheduler import run_blockwise, segment_blockwise +from . 
import global_vars diff --git a/dacapo/blockwise/argmax_worker.py b/dacapo/blockwise/argmax_worker.py index d9f89345..2c15a162 100644 --- a/dacapo/blockwise/argmax_worker.py +++ b/dacapo/blockwise/argmax_worker.py @@ -54,6 +54,22 @@ def start_worker( output_container: Path | str, output_dataset: str, return_io_loop: bool = False, +): + return start_worker_fn( + input_container=input_container, + input_dataset=input_dataset, + output_container=output_container, + output_dataset=output_dataset, + return_io_loop=return_io_loop, + ) + + +def start_worker_fn( + input_container: Path | str, + input_dataset: str, + output_container: Path | str, + output_dataset: str, + return_io_loop: bool = False, ): """ Start the threshold worker. @@ -111,7 +127,7 @@ def spawn_worker( """ compute_context = create_compute_context() if not compute_context.distribute_workers: - return start_worker( + return start_worker_fn( input_array_identifier.container, input_array_identifier.dataset, output_array_identifier.container, diff --git a/dacapo/blockwise/empanada_function.py b/dacapo/blockwise/empanada_function.py index 301a282e..09871de8 100644 --- a/dacapo/blockwise/empanada_function.py +++ b/dacapo/blockwise/empanada_function.py @@ -50,7 +50,7 @@ def segment_function(input_array, block, **parameters): Args: input_array (np.ndarray): The 3D array to segment. - block (dask.array.core.Block): The block object. + block (daisy.Block): The block object. **parameters: Parameters for the empanada-napari segmenter. Returns: np.ndarray: The segmented 3D array. diff --git a/dacapo/blockwise/global_vars.py b/dacapo/blockwise/global_vars.py new file mode 100644 index 00000000..0c804e3f --- /dev/null +++ b/dacapo/blockwise/global_vars.py @@ -0,0 +1 @@ +current_run = None diff --git a/dacapo/blockwise/predict_worker.py b/dacapo/blockwise/predict_worker.py index c8b66673..4485cf80 100644 --- a/dacapo/blockwise/predict_worker.py +++ b/dacapo/blockwise/predict_worker.py @@ -17,6 +17,7 @@ import numpy as np import click +from dacapo.blockwise import global_vars import logging @@ -27,6 +28,20 @@ path = __file__ +def is_global_run_set(run_name) -> bool: + if global_vars.current_run is not None: + if global_vars.current_run.name == run_name: + return True + else: + logger.error( + f"Found global run {global_vars.current_run.name} but looking for {run_name}" + ) + return False + else: + logger.error("No global run is set.") + return False + + @click.group() @click.option( "--log-level", @@ -77,6 +92,26 @@ def start_worker( output_container: Path | str, output_dataset: str, return_io_loop: Optional[bool] = False, +): + return start_worker_fn( + run_name=run_name, + iteration=iteration, + input_container=input_container, + input_dataset=input_dataset, + output_container=output_container, + output_dataset=output_dataset, + return_io_loop=return_io_loop, + ) + + +def start_worker_fn( + run_name: str, + iteration: int | None, + input_container: Path | str, + input_dataset: str, + output_container: Path | str, + output_dataset: str, + return_io_loop: Optional[bool] = False, ): """ Start a worker to apply a trained model to a dataset. @@ -89,89 +124,97 @@ def start_worker( output_container (Path | str): The output container. output_dataset (str): The output dataset. 
""" - compute_context = create_compute_context() - device = compute_context.device - # retrieving run - config_store = create_config_store() - run_config = config_store.retrieve_run_config(run_name) - run = Run(run_config) + def io_loop(): + daisy_client = daisy.Client() - if iteration is not None: - # create weights store - weights_store = create_weights_store() + compute_context = create_compute_context() + device = compute_context.device + + if is_global_run_set(run_name): + logger.warning("Using global run variable") + run = global_vars.current_run + else: + logger.warning("initiating local run in predict_worker") + config_store = create_config_store() + run_config = config_store.retrieve_run_config(run_name) + run = Run(run_config) + + if iteration is not None and compute_context.distribute_workers: + # create weights store + weights_store = create_weights_store() + + # load weights + run.model.load_state_dict( + weights_store.retrieve_weights(run_name, iteration).model + ) - # load weights - run.model.load_state_dict( - weights_store.retrieve_weights(run_name, iteration).model + # get arrays + input_array_identifier = LocalArrayIdentifier( + Path(input_container), input_dataset ) + raw_array = ZarrArray.open_from_array_identifier(input_array_identifier) - # get arrays - input_array_identifier = LocalArrayIdentifier(Path(input_container), input_dataset) - raw_array = ZarrArray.open_from_array_identifier(input_array_identifier) - - output_array_identifier = LocalArrayIdentifier( - Path(output_container), output_dataset - ) - output_array = ZarrArray.open_from_array_identifier(output_array_identifier) - - # set benchmark flag to True for performance - torch.backends.cudnn.benchmark = True - - # get the model's input and output size - model = run.model.eval().to(device) - input_voxel_size = Coordinate(raw_array.voxel_size) - output_voxel_size = model.scale(input_voxel_size) - input_shape = Coordinate(model.eval_input_shape) - input_size = input_voxel_size * input_shape - output_size = output_voxel_size * model.compute_output_shape(input_shape)[1] - - print(f"Predicting with input size {input_size}, output size {output_size}") - - # create gunpowder keys - - raw = gp.ArrayKey("RAW") - prediction = gp.ArrayKey("PREDICTION") - - # assemble prediction pipeline - - # prepare data source - pipeline = DaCapoArraySource(raw_array, raw) - # raw: (c, d, h, w) - pipeline += gp.Pad(raw, None) - # raw: (c, d, h, w) - pipeline += gp.Unsqueeze([raw]) - # raw: (1, c, d, h, w) - - pipeline += gp.Normalize(raw) - - # predict - # model.eval() - pipeline += gp_torch.Predict( - model=model, - inputs={"x": raw}, - outputs={0: prediction}, - array_specs={ - prediction: gp.ArraySpec( - voxel_size=output_voxel_size, - dtype=np.float32, # assumes network output is float32 - ) - }, - spawn_subprocess=False, - device=str(device), - ) - - # make reference batch request - request = gp.BatchRequest() - request.add(raw, input_size, voxel_size=input_voxel_size) - request.add( - prediction, - output_size, - voxel_size=output_voxel_size, - ) + output_array_identifier = LocalArrayIdentifier( + Path(output_container), output_dataset + ) + output_array = ZarrArray.open_from_array_identifier(output_array_identifier) + + # set benchmark flag to True for performance + torch.backends.cudnn.benchmark = True + + # get the model's input and output size + model = run.model.eval() + # .to(device) + input_voxel_size = Coordinate(raw_array.voxel_size) + output_voxel_size = model.scale(input_voxel_size) + input_shape = 
Coordinate(model.eval_input_shape) + input_size = input_voxel_size * input_shape + output_size = output_voxel_size * model.compute_output_shape(input_shape)[1] + + print(f"Predicting with input size {input_size}, output size {output_size}") + + # create gunpowder keys + + raw = gp.ArrayKey("RAW") + prediction = gp.ArrayKey("PREDICTION") + + # assemble prediction pipeline + + # prepare data source + pipeline = DaCapoArraySource(raw_array, raw) + # raw: (c, d, h, w) + pipeline += gp.Pad(raw, None) + # raw: (c, d, h, w) + pipeline += gp.Unsqueeze([raw]) + # raw: (1, c, d, h, w) + + pipeline += gp.Normalize(raw) + + # predict + # model.eval() + pipeline += gp_torch.Predict( + model=model, + inputs={"x": raw}, + outputs={0: prediction}, + array_specs={ + prediction: gp.ArraySpec( + voxel_size=output_voxel_size, + dtype=np.float32, # assumes network output is float32 + ) + }, + spawn_subprocess=False, + device=str(device), + ) - def io_loop(): - daisy_client = daisy.Client() + # make reference batch request + request = gp.BatchRequest() + request.add(raw, input_size, voxel_size=input_voxel_size) + request.add( + prediction, + output_size, + voxel_size=output_voxel_size, + ) while True: with daisy_client.acquire_block() as block: @@ -224,14 +267,15 @@ def spawn_worker( Callable: The function to run the worker. """ compute_context = create_compute_context() + if not compute_context.distribute_workers: - return start_worker( - run_name, - iteration, - input_array_identifier.container, - input_array_identifier.dataset, - output_array_identifier.container, - output_array_identifier.dataset, + return start_worker_fn( + run_name=run_name, + iteration=iteration, + input_container=input_array_identifier.container, + input_dataset=input_array_identifier.dataset, + output_container=output_array_identifier.container, + output_dataset=output_array_identifier.dataset, return_io_loop=True, ) diff --git a/dacapo/blockwise/relabel_worker.py b/dacapo/blockwise/relabel_worker.py index 423a878f..bf4e185c 100644 --- a/dacapo/blockwise/relabel_worker.py +++ b/dacapo/blockwise/relabel_worker.py @@ -47,8 +47,20 @@ def start_worker( output_dataset, tmpdir, return_io_loop=False, - *args, - **kwargs, +): + return start_worker_fn( + output_container=output_container, + output_dataset=output_dataset, + tmpdir=tmpdir, + return_io_loop=return_io_loop, + ) + + +def start_worker_fn( + output_container, + output_dataset, + tmpdir, + return_io_loop=False, ): """ Start the relabel worker. @@ -145,8 +157,6 @@ def read_cross_block_merges(tmpdir): def spawn_worker( output_array_identifier: LocalArrayIdentifier, tmpdir: str, - *args, - **kwargs, ): """ Spawn a worker to predict on a given dataset. 
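The predict worker above introduces a module-level hook, `dacapo.blockwise.global_vars.current_run`, which `is_global_run_set()` compares against the requested run name so that an already-loaded `Run` can be reused in-process instead of being reloaded from the config store. The code that sets this variable is not part of this excerpt, so the helpers below are only a sketch of how a caller might use the hook; the function names are hypothetical.

```python
# Hypothetical helpers for the global-run hook; not part of this patch.
from dacapo.blockwise import global_vars


def set_global_run(run):
    # Publish an already-loaded Run so predict_worker.is_global_run_set(run.name)
    # returns True and start_worker_fn skips retrieve_run_config / weight loading.
    global_vars.current_run = run


def clear_global_run():
    # Restore the default so later workers fall back to the config store.
    global_vars.current_run = None
```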
@@ -160,7 +170,7 @@ def spawn_worker( compute_context = create_compute_context() if not compute_context.distribute_workers: - return start_worker( + return start_worker_fn( output_array_identifier.container, output_array_identifier.dataset, tmpdir, diff --git a/dacapo/blockwise/segment_worker.py b/dacapo/blockwise/segment_worker.py index 0ecec2e1..2ccccf48 100644 --- a/dacapo/blockwise/segment_worker.py +++ b/dacapo/blockwise/segment_worker.py @@ -48,12 +48,32 @@ def cli(log_level): @click.option("--tmpdir", type=str, help="Temporary directory") @click.option("--function_path", type=str, help="Path to the segment function") def start_worker( - input_container: str, + input_container: str | Path, input_dataset: str, - output_container: str, + output_container: str | Path, output_dataset: str, - tmpdir: str, - function_path: str, + tmpdir: str | Path, + function_path: str | Path, + return_io_loop: bool = False, +): + return start_worker_fn( + input_container=input_container, + input_dataset=input_dataset, + output_container=output_container, + output_dataset=output_dataset, + tmpdir=tmpdir, + function_path=function_path, + return_io_loop=return_io_loop, + ) + + +def start_worker_fn( + input_container: str | Path, + input_dataset: str, + output_container: str | Path, + output_dataset: str, + tmpdir: str | Path, + function_path: str | Path, return_io_loop: bool = False, ): """ @@ -211,7 +231,7 @@ def spawn_worker( """ compute_context = create_compute_context() if not compute_context.distribute_workers: - return start_worker( + return start_worker_fn( input_array_identifier.container, input_array_identifier.dataset, output_array_identifier.container, diff --git a/dacapo/blockwise/threshold_worker.py b/dacapo/blockwise/threshold_worker.py index 3e05f13c..b6be79d2 100644 --- a/dacapo/blockwise/threshold_worker.py +++ b/dacapo/blockwise/threshold_worker.py @@ -50,6 +50,24 @@ def start_worker( output_dataset: str, threshold: float = 0.0, return_io_loop: bool = False, +): + return start_worker_fn( + input_container=input_container, + input_dataset=input_dataset, + output_container=output_container, + output_dataset=output_dataset, + threshold=threshold, + return_io_loop=return_io_loop, + ) + + +def start_worker_fn( + input_container: Path | str, + input_dataset: str, + output_container: Path | str, + output_dataset: str, + threshold: float = 0.0, + return_io_loop: bool = False, ): """ Start the threshold worker. 
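Every worker touched by this patch (argmax, predict, relabel, segment, threshold) now follows the same shape: the click-decorated `start_worker` only forwards its arguments to a plain `start_worker_fn`, and `spawn_worker` calls `start_worker_fn(..., return_io_loop=True)` directly when `compute_context.distribute_workers` is false. A minimal sketch of that pattern, with generic option names rather than the exact dacapo signatures:

```python
# Generic sketch of the start_worker / start_worker_fn split used by the workers above.
import click


def start_worker_fn(input_container, input_dataset, output_container, output_dataset,
                    return_io_loop=False):
    # Plain function: importable and callable in-process, no CLI parsing required.
    def io_loop():
        # Acquire daisy blocks and process them here.
        ...

    if return_io_loop:
        return io_loop  # non-distributed case: the caller runs the loop itself
    io_loop()           # CLI / spawned-process case: run immediately


@click.command()
@click.option("--input_container", required=True)
@click.option("--input_dataset", required=True)
@click.option("--output_container", required=True)
@click.option("--output_dataset", required=True)
def start_worker(input_container, input_dataset, output_container, output_dataset):
    # Thin CLI wrapper kept for workers started as separate processes.
    start_worker_fn(input_container, input_dataset, output_container, output_dataset)
```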
@@ -109,7 +127,7 @@ def spawn_worker( """ compute_context = create_compute_context() if not compute_context.distribute_workers: - return start_worker( + return start_worker_fn( input_array_identifier.container, input_array_identifier.dataset, output_array_identifier.container, diff --git a/dacapo/cli.py b/dacapo/cli.py index f4a2a6c4..2af9aea7 100644 --- a/dacapo/cli.py +++ b/dacapo/cli.py @@ -2,7 +2,7 @@ from typing import Optional import numpy as np - +import yaml import dacapo import click import logging @@ -17,6 +17,8 @@ ) from dacapo.store.local_array_store import LocalArrayIdentifier from dacapo.experiments.datasplits.datasets.arrays import ZarrArray +from dacapo.options import DaCapoConfig +import os @click.group() @@ -92,8 +94,8 @@ def train(run_name): @click.option("-w", "--num_workers", type=int, default=30) @click.option("-dt", "--output_dtype", type=str, default="uint8") @click.option("-ow", "--overwrite", is_flag=True) -def validate(run_name, iteration): - dacapo.validate(run_name, iteration) +def validate(run_name, iteration, num_workers, output_dtype, overwrite): + dacapo.validate_run(run_name, iteration, num_workers, output_dtype, overwrite) @cli.command() @@ -686,6 +688,124 @@ def segment_blockwise( ) +def prompt_with_choices(prompt_text, choices, default_index=0): + """ + Prompts the user with a list of choices and returns the selected choice. + + Args: + prompt_text (str): The prompt text to display to the user. + choices (list): The list of choices to present. + default_index (int): The index of the default choice (0-based). + + Returns: + str: The selected choice. + """ + while True: + click.echo(prompt_text) + for i, choice in enumerate(choices, 1): + click.echo(f"{i} - {choice}") + + # If the default_index is out of range, set to 0 + default_index = max(0, min(default_index, len(choices) - 1)) + + try: + # Prompt the user for input + choice_num = click.prompt( + f"Enter your choice (default: {choices[default_index]})", + default=default_index + 1, + type=int, + ) + + # Check if the provided number is valid + if 1 <= choice_num <= len(choices): + return choices[choice_num - 1] + else: + click.echo("Invalid choice number. Please try again.") + except click.BadParameter: + click.echo("Invalid input. Please enter a number.") + + +@cli.command() +def config(): + if os.path.exists("dacapo.yaml"): + overwrite = click.confirm( + "dacapo.yaml already exists. 
Do you want to overwrite it?", default=False + ) + if not overwrite: + click.echo("Aborting configuration creation.") + return + runs_base_dir = click.prompt("Enter the base directory for runs", type=str) + storage_type = prompt_with_choices("Enter the type of storage:", ["files", "mongo"]) + mongo_db_name = None + mongo_db_host = None + if storage_type == "mongo": + mongo_db_name = click.prompt("Enter the name of the MongoDB database", type=str) + mongo_db_host = click.prompt("Enter the MongoDB host URI", type=str) + + compute_type = prompt_with_choices( + "Enter the type of compute context:", ["LocalTorch", "Bsub"] + ) + if compute_type == "Bsub": + queue = click.prompt("Enter the queue for compute context", type=str) + num_gpus = click.prompt("Enter the number of GPUs", type=int) + num_cpus = click.prompt("Enter the number of CPUs", type=int) + billing = click.prompt("Enter the billing account", type=str) + compute_context = { + "type": compute_type, + "config": { + "queue": queue, + "num_gpus": num_gpus, + "num_cpus": num_cpus, + "billing": billing, + }, + } + else: + compute_context = {"type": compute_type} + + try: + generate_config( + runs_base_dir, + storage_type, + compute_type, + compute_context, + mongo_db_name, + mongo_db_host, + ) + except ValueError as e: + logger.error(str(e)) + + +def generate_dacapo_yaml(config): + with open("dacapo.yaml", "w") as f: + yaml.dump(config.serialize(), f, default_flow_style=False) + print("dacapo.yaml has been created.") + + +def generate_config( + runs_base_dir, + storage_type, + compute_type, + compute_context, + mongo_db_name=None, + mongo_db_host=None, +): + config = DaCapoConfig( + type=storage_type, + runs_base_dir=Path(runs_base_dir).expanduser(), + compute_context=compute_context, + ) + + if storage_type == "mongo": + if not mongo_db_name or not mongo_db_host: + raise ValueError( + "--mongo_db_name and --mongo_db_host are required when type is 'mongo'" + ) + config.mongo_db_name = mongo_db_name + config.mongo_db_host = mongo_db_host + + generate_dacapo_yaml(config) + + def unpack_ctx(ctx): """ Unpacks the context object and returns a dictionary of keyword arguments. diff --git a/dacapo/compute_context/local_torch.py b/dacapo/compute_context/local_torch.py index 2cda4058..5a0371a4 100644 --- a/dacapo/compute_context/local_torch.py +++ b/dacapo/compute_context/local_torch.py @@ -23,6 +23,12 @@ class LocalTorch(ComputeContext): The class is a subclass of the ComputeContext class. """ + distribute_workers: Optional[bool] = attr.ib( + default=False, + metadata={ + "help_text": "Whether to distribute the workers across multiple nodes or processes." 
+ }, + ) _device: Optional[str] = attr.ib( default=None, metadata={ diff --git a/dacapo/experiments/datasplits/datasets/arrays/__init__.py b/dacapo/experiments/datasplits/datasets/arrays/__init__.py index 63d6d6e2..74091aba 100644 --- a/dacapo/experiments/datasplits/datasets/arrays/__init__.py +++ b/dacapo/experiments/datasplits/datasets/arrays/__init__.py @@ -22,3 +22,4 @@ # nonconfigurable arrays (helpers) from .numpy_array import NumpyArray # noqa +from .constant_array_config import ConstantArray, ConstantArrayConfig # noqa diff --git a/dacapo/experiments/datasplits/datasets/arrays/concat_array.py b/dacapo/experiments/datasplits/datasets/arrays/concat_array.py index 2cea77a0..c2ef4096 100644 --- a/dacapo/experiments/datasplits/datasets/arrays/concat_array.py +++ b/dacapo/experiments/datasplits/datasets/arrays/concat_array.py @@ -459,3 +459,82 @@ def __getitem__(self, roi: Roi) -> np.ndarray: f"Concatenated array has only one channel: {self.name} {concatenated.shape}" ) return concatenated + + def _can_neuroglance(self): + """ + This method returns True if the source array can be visualized in neuroglance. + + Returns: + bool: True if the source array can be visualized in neuroglance. + Raises: + ValueError: If the source array is not writable. + Examples: + >>> binarize_array._can_neuroglance() + Note: + This method is used to return True if the source array can be visualized in neuroglance. + """ + return any( + [ + source_array._can_neuroglance() + for source_array in self.source_arrays.values() + ] + ) + + def _neuroglancer_source(self): + """ + This method returns the source array for neuroglancer. + + Returns: + neuroglancer.LocalVolume: The source array for neuroglancer. + Raises: + ValueError: If the source array is not writable. + Examples: + >>> binarize_array._neuroglancer_source() + Note: + This method is used to return the source array for neuroglancer. + """ + # return self._source_array._neuroglancer_source() + return [ + source_array._neuroglancer_source() + for source_array in self.source_arrays.values() + ] + + def _neuroglancer_layer(self): + """ + This method returns the neuroglancer layer for the source array. + + Returns: + neuroglancer.SegmentationLayer: The neuroglancer layer for the source array. + Raises: + ValueError: If the source array is not writable. + Examples: + >>> binarize_array._neuroglancer_layer() + Note: + This method is used to return the neuroglancer layer for the source array. + """ + # layer = neuroglancer.SegmentationLayer(source=self._neuroglancer_source()) + return [ + source_array._neuroglancer_layer() + for source_array in self.source_arrays.values() + if source_array._can_neuroglance() + ] + + def _source_name(self): + """ + This method returns the name of the source array. + + Returns: + str: The name of the source array. + Raises: + ValueError: If the source array is not writable. + Examples: + >>> binarize_array._source_name() + Note: + This method is used to return the name of the source array. 
+ """ + # return self._source_array._source_name() + return [ + source_array._source_name() + for source_array in self.source_arrays.values() + if source_array._can_neuroglance() + ] diff --git a/dacapo/experiments/datasplits/datasets/arrays/constant_array.py b/dacapo/experiments/datasplits/datasets/arrays/constant_array.py new file mode 100644 index 00000000..411591b7 --- /dev/null +++ b/dacapo/experiments/datasplits/datasets/arrays/constant_array.py @@ -0,0 +1,493 @@ +from .array import Array + +from funlib.geometry import Roi + +import numpy as np +import neuroglancer + + +class ConstantArray(Array): + """ + This is a wrapper around another `source_array` that simply provides constant value + with the same metadata as the `source_array`. + + This is useful for creating a mask array that is the same size as the + original array, but with all values set to 1. + + Attributes: + source_array: The source array that this array is based on. + Methods: + like: Create a new ConstantArray with the same metadata as another array. + attrs: Get the attributes of the array. + axes: Get the axes of the array. + dims: Get the dimensions of the array. + voxel_size: Get the voxel size of the array. + roi: Get the region of interest of the array. + writable: Check if the array is writable. + data: Get the data of the array. + dtype: Get the data type of the array. + num_channels: Get the number of channels of the array. + __getitem__: Get a subarray of the array. + Note: + This class is not meant to be instantiated directly. Instead, use the + `like` method to create a new ConstantArray with the same metadata as + another array. + """ + + def __init__(self, array_config): + """ + Initialize the ConstantArray with the given array configuration. + + Args: + array_config: The configuration of the source array. + Raises: + RuntimeError: If the source array is not specified in the + configuration. + Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import ArrayConfig + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> from funlib.geometry import Roi + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> source_array_config = ArrayConfig(source_array) + >>> ones_array = ConstantArray(source_array_config) + >>> ones_array.source_array + NumpyArray(data=array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]]), voxel_size=(1.0, 1.0, 1.0), roi=Roi((0, 0, 0), (10, 10, 10)), num_channels=1) + Notes: + This class is not meant to be instantiated directly. Instead, use the + `like` method to create a new ConstantArray with the same metadata as + another array. + """ + self._source_array = array_config.source_array_config.array_type( + array_config.source_array_config + ) + self._constant = array_config.constant + + @classmethod + def like(cls, array: Array): + """ + Create a new ConstantArray with the same metadata as another array. + + Args: + array: The source array. + Returns: + The new ConstantArray with the same metadata as the source array. 
+ Raises: + RuntimeError: If the source array is not specified. + Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray.like(source_array) + >>> ones_array.source_array + NumpyArray(data=array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]]), voxel_size=(1.0, 1.0, 1.0), roi=Roi((0, 0, 0), (10, 10, 10)), num_channels=1) + Notes: + This class is not meant to be instantiated directly. Instead, use the + `like` method to create a new ConstantArray with the same metadata as + another array. + + """ + instance = cls.__new__(cls) + instance._source_array = array + return instance + + @property + def attrs(self): + """ + Get the attributes of the array. + + Returns: + An empty dictionary. + Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray(source_array) + >>> ones_array.attrs + {} + Notes: + This method is used to get the attributes of the array. The attributes + are stored as key-value pairs in a dictionary. This method returns an + empty dictionary because the ConstantArray does not have any attributes. + """ + return dict() + + @property + def source_array(self) -> Array: + """ + Get the source array that this array is based on. + + Returns: + The source array. + Raises: + RuntimeError: If the source array is not specified. + Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray(source_array) + >>> ones_array.source_array + NumpyArray(data=array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]]), voxel_size=(1.0, 1.0, 1.0), roi=Roi((0, 0, 0), (10, 10, 10)), num_channels=1) + Notes: + This method is used to get the source array that this array is based on. + The source array is the array that the ConstantArray is created from. This + method returns the source array that was specified when the ConstantArray + was created. + """ + return self._source_array + + @property + def axes(self): + """ + Get the axes of the array. + + Returns: + The axes of the array. + Raises: + RuntimeError: If the axes are not specified. 
+ Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray(source_array) + >>> ones_array.axes + 'zyx' + Notes: + This method is used to get the axes of the array. The axes are the + order of the dimensions of the array. This method returns the axes of + the array that was specified when the ConstantArray was created. + """ + return self.source_array.axes + + @property + def dims(self): + """ + Get the dimensions of the array. + + Returns: + The dimensions of the array. + Raises: + RuntimeError: If the dimensions are not specified. + Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray(source_array) + >>> ones_array.dims + (10, 10, 10) + Notes: + This method is used to get the dimensions of the array. The dimensions + are the size of the array along each axis. This method returns the + dimensions of the array that was specified when the ConstantArray was created. + """ + return self.source_array.dims + + @property + def voxel_size(self): + """ + Get the voxel size of the array. + + Returns: + The voxel size of the array. + Raises: + RuntimeError: If the voxel size is not specified. + Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray(source_array) + >>> ones_array.voxel_size + (1.0, 1.0, 1.0) + Notes: + This method is used to get the voxel size of the array. The voxel size + is the size of each voxel in the array. This method returns the voxel + size of the array that was specified when the ConstantArray was created. + """ + return self.source_array.voxel_size + + @property + def roi(self): + """ + Get the region of interest of the array. + + Returns: + The region of interest of the array. + Raises: + RuntimeError: If the region of interest is not specified. + Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> from funlib.geometry import Roi + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray(source_array) + >>> ones_array.roi + Roi((0, 0, 0), (10, 10, 10)) + Notes: + This method is used to get the region of interest of the array. The + region of interest is the region of the array that contains the data. + This method returns the region of interest of the array that was specified + when the ConstantArray was created. + """ + return self.source_array.roi + + @property + def writable(self) -> bool: + """ + Check if the array is writable. + + Returns: + False. + Raises: + RuntimeError: If the writability of the array is not specified. 
+ Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray(source_array) + >>> ones_array.writable + False + Notes: + This method is used to check if the array is writable. An array is + writable if it can be modified in place. This method returns False + because the ConstantArray is read-only and cannot be modified. + """ + return False + + @property + def data(self): + """ + Get the data of the array. + + Returns: + The data of the array. + Raises: + RuntimeError: If the data is not specified. + Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray(source_array) + >>> ones_array.data + array([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]) + Notes: + This method is used to get the data of the array. The data is the + values that are stored in the array. This method returns a subarray + of the array with all values set to 1. + """ + raise RuntimeError("Cannot get writable version of this data!") + + @property + def dtype(self): + """ + Get the data type of the array. + + Returns: + The data type of the array. + Raises: + RuntimeError: If the data type is not specified. + Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray(source_array) + >>> ones_array.dtype + + Notes: + This method is used to get the data type of the array. The data type + is the type of the values that are stored in the array. This method + returns the data type of the array that was specified when the ConstantArray + was created. + """ + return bool + + @property + def num_channels(self): + """ + Get the number of channels of the array. + + Returns: + The number of channels of the array. + Raises: + RuntimeError: If the number of channels is not specified. + Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray(source_array) + >>> ones_array.num_channels + 1 + Notes: + This method is used to get the number of channels of the array. The + number of channels is the number of values that are stored at each + voxel in the array. This method returns the number of channels of the + array that was specified when the ConstantArray was created. + """ + return self.source_array.num_channels + + def __getitem__(self, roi: Roi) -> np.ndarray: + """ + Get a subarray of the array. + + Args: + roi: The region of interest. + Returns: + A subarray of the array with all values set to 1. 
+ Examples: + >>> from dacapo.experiments.datasplits.datasets.arrays import ConstantArray + >>> from dacapo.experiments.datasplits.datasets.arrays import NumpyArray + >>> from funlib.geometry import Roi + >>> import numpy as np + >>> source_array = NumpyArray(np.zeros((10, 10, 10))) + >>> ones_array = ConstantArray(source_array) + >>> roi = Roi((0, 0, 0), (10, 10, 10)) + >>> ones_array[roi] + array([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]) + Notes: + This method is used to get a subarray of the array. The subarray is + specified by the region of interest. This method returns a subarray + of the array with all values set to 1. + """ + return ( + np.ones_like(self.source_array.__getitem__(roi), dtype=bool) + * self._constant + ) + + def _can_neuroglance(self): + """ + This method returns True if the source array can be visualized in neuroglance. + + Returns: + bool: True if the source array can be visualized in neuroglance. + Raises: + ValueError: If the source array is not writable. + Examples: + >>> binarize_array._can_neuroglance() + Note: + This method is used to return True if the source array can be visualized in neuroglance. + """ + return True + + def _neuroglancer_source(self): + """ + This method returns the source array for neuroglancer. + + Returns: + neuroglancer.LocalVolume: The source array for neuroglancer. + Raises: + ValueError: If the source array is not writable. + Examples: + >>> binarize_array._neuroglancer_source() + Note: + This method is used to return the source array for neuroglancer. + """ + # return self._source_array._neuroglancer_source() + shape = self.source_array[self.source_array.roi].shape + return np.ones(shape, dtype=np.uint64) * self._constant + + def _combined_neuroglancer_source(self) -> neuroglancer.LocalVolume: + """ + Combines dimensions and metadata from self._source_array._neuroglancer_source() + with data from self._neuroglancer_source(). + + Returns: + neuroglancer.LocalVolume: The combined neuroglancer source. + """ + source_array_volume = self._source_array._neuroglancer_source() + result_data = self._neuroglancer_source() + + return neuroglancer.LocalVolume( + data=result_data, + dimensions=source_array_volume.dimensions, + voxel_offset=source_array_volume.voxel_offset, + ) + + def _neuroglancer_layer(self): + """ + This method returns the neuroglancer layer for the source array. + + Returns: + neuroglancer.SegmentationLayer: The neuroglancer layer for the source array. + Raises: + ValueError: If the source array is not writable. + Examples: + >>> binarize_array._neuroglancer_layer() + Note: + This method is used to return the neuroglancer layer for the source array. + """ + # layer = neuroglancer.SegmentationLayer(source=self._neuroglancer_source()) + return neuroglancer.SegmentationLayer( + source=self._combined_neuroglancer_source() + ) + + def _source_name(self): + """ + This method returns the name of the source array. + + Returns: + str: The name of the source array. + Raises: + ValueError: If the source array is not writable. + Examples: + >>> binarize_array._source_name() + Note: + This method is used to return the name of the source array. 
+ """ + # return self._source_array._source_name() + return f"{self._constant}_of_{self.source_array._source_name()}" diff --git a/dacapo/experiments/datasplits/datasets/arrays/constant_array_config.py b/dacapo/experiments/datasplits/datasets/arrays/constant_array_config.py new file mode 100644 index 00000000..47c2b868 --- /dev/null +++ b/dacapo/experiments/datasplits/datasets/arrays/constant_array_config.py @@ -0,0 +1,32 @@ +import attr + +from .array_config import ArrayConfig +from .constant_array import ConstantArray + + +@attr.s +class ConstantArrayConfig(ArrayConfig): + """ + This array read data from the source array and then return a np.ones_like() version. + + This is useful for creating a mask array from a source array. For example, if you have a + 2D array of data and you want to create a mask array that is the same shape as the data + array, you can use this class to create the mask array. + + Attributes: + source_array_config: The source array that you want to copy and fill with ones. + Methods: + create_array: Create the array. + Note: + This class is a subclass of ArrayConfig. + """ + + array_type = ConstantArray + + source_array_config: ArrayConfig = attr.ib( + metadata={"help_text": "The Array that you want to copy and fill with ones."} + ) + + constant: int = attr.ib( + metadata={"help_text": "The constant value to fill the array with."}, default=1 + ) diff --git a/dacapo/experiments/datasplits/datasets/arrays/logical_or_array.py b/dacapo/experiments/datasplits/datasets/arrays/logical_or_array.py index 212d933a..580f54d6 100644 --- a/dacapo/experiments/datasplits/datasets/arrays/logical_or_array.py +++ b/dacapo/experiments/datasplits/datasets/arrays/logical_or_array.py @@ -600,52 +600,50 @@ def _neuroglancer_source(self): The _neuroglancer_source method is used to get the neuroglancer source of the array. The neuroglancer source is the source that is displayed in the neuroglancer viewer. """ + # source_arrays + if hassattr(self._source_array, "source_arrays"): + source_arrays = list(self._source_array.source_arrays) + # apply logical or + mask = np.logical_or.reduce(source_arrays) + return mask return self._source_array._neuroglancer_source() + def _combined_neuroglancer_source(self) -> neuroglancer.LocalVolume: + """ + Combines dimensions and metadata from self._source_array._neuroglancer_source() + with data from self._neuroglancer_source(). + + Returns: + neuroglancer.LocalVolume: The combined neuroglancer source. + """ + source_array_volume = self._source_array._neuroglancer_source() + if isinstance(source_array_volume, list): + source_array_volume = source_array_volume[0] + result_data = self._neuroglancer_source() + + return neuroglancer.LocalVolume( + data=result_data, + dimensions=source_array_volume.dimensions, + voxel_offset=source_array_volume.voxel_offset, + ) + def _neuroglancer_layer(self): """ - Get the neuroglancer layer of the array + This method returns the neuroglancer layer for the source array. Returns: - Tuple[neuroglancer.Layer, dict]: The neuroglancer layer of the array + neuroglancer.SegmentationLayer: The neuroglancer layer for the source array. Raises: - ValueError: If the array is not writable + ValueError: If the source array is not writable. Examples: - >>> array_config = MergeInstancesArrayConfig( - ... name="logical_or", - ... source_array_configs=[ - ... ArrayConfig( - ... name="mask1", - ... array_type=MaskArray, - ... source_array_config=MaskArrayConfig( - ... name="mask1", - ... mask_id=1, - ... ), - ... ), - ... ArrayConfig( - ... 
name="mask2", - ... array_type=MaskArray, - ... source_array_config=MaskArrayConfig( - ... name="mask2", - ... mask_id=2, - ... ), - ... ), - ... ], - ... ) - >>> array = array_config.create_array() - >>> array._neuroglancer_layer() - (SegmentationLayer(source='precomputed://https://mybucket.storage.googleapis.com/path/to/logical_or'), {'visible': False}) - Notes: - The _neuroglancer_layer method is used to get the neuroglancer layer of the array. - The neuroglancer layer is the layer that is displayed in the neuroglancer viewer. + >>> binarize_array._neuroglancer_layer() + Note: + This method is used to return the neuroglancer layer for the source array. """ - # Generates an Segmentation layer - - layer = neuroglancer.SegmentationLayer(source=self._neuroglancer_source()) - kwargs = { - "visible": False, - } - return layer, kwargs + # layer = neuroglancer.SegmentationLayer(source=self._neuroglancer_source()) + return neuroglancer.SegmentationLayer( + source=self._combined_neuroglancer_source() + ) def _source_name(self): """ @@ -684,4 +682,7 @@ def _source_name(self): The _source_name method is used to get the name of the source array. The name of the source array is the name of the array that is being modified. """ - return self._source_array._source_name() + name = self._source_array._source_name() + if isinstance(name, list): + name = "_".join(name) + return "logical_or" + name diff --git a/dacapo/experiments/datasplits/datasets/arrays/ones_array.py b/dacapo/experiments/datasplits/datasets/arrays/ones_array.py index 6fd5c4fa..cf2c416f 100644 --- a/dacapo/experiments/datasplits/datasets/arrays/ones_array.py +++ b/dacapo/experiments/datasplits/datasets/arrays/ones_array.py @@ -4,6 +4,10 @@ import numpy as np +import logging + +logger = logging.getLogger(__name__) + class OnesArray(Array): """ @@ -67,6 +71,7 @@ def __init__(self, array_config): `like` method to create a new OnesArray with the same metadata as another array. """ + logger.warning("OnesArray is deprecated. Use ConstantArray instead.") self._source_array = array_config.source_array_config.array_type( array_config.source_array_config ) diff --git a/dacapo/experiments/datasplits/datasets/arrays/resampled_array.py b/dacapo/experiments/datasplits/datasets/arrays/resampled_array.py index 5c60a5df..ba6fd99f 100644 --- a/dacapo/experiments/datasplits/datasets/arrays/resampled_array.py +++ b/dacapo/experiments/datasplits/datasets/arrays/resampled_array.py @@ -236,10 +236,11 @@ def data(self): Note: This method returns the data of the resampled array. """ - raise ValueError( - "Cannot get a writable view of this array because it is a virtual " - "array created by modifying another array on demand." - ) + return self._source_array.data + # raise ValueError( + # "Cannot get a writable view of this array because it is a virtual " + # "array created by modifying another array on demand." 
+        # )
 
     @property
     def scale(self):
diff --git a/dacapo/experiments/datasplits/datasets/arrays/zarr_array.py b/dacapo/experiments/datasplits/datasets/arrays/zarr_array.py
index f61bf0cd..30c6ac69 100644
--- a/dacapo/experiments/datasplits/datasets/arrays/zarr_array.py
+++ b/dacapo/experiments/datasplits/datasets/arrays/zarr_array.py
@@ -369,11 +369,17 @@ def data(self) -> Any:
         """
         file_name = str(self.file_name)
         # Zarr library does not detect the store for N5 datasets
-        if file_name.endswith(".n5"):
-            zarr_container = zarr.open(N5FSStore(str(file_name)), mode=self.mode)
-        else:
-            zarr_container = zarr.open(str(file_name), mode=self.mode)
-        return zarr_container[self.dataset]
+        try:
+            if file_name.endswith(".n5"):
+                zarr_container = zarr.open(N5FSStore(str(file_name)), mode=self.mode)
+            else:
+                zarr_container = zarr.open(str(file_name), mode=self.mode)
+            return zarr_container[self.dataset]
+        except Exception as e:
+            logger.error(
+                f"Could not open dataset {self.dataset} in file {file_name} in mode {self.mode}"
+            )
+            raise e
 
     def __getitem__(self, roi: Roi) -> np.ndarray:
         """
diff --git a/dacapo/experiments/datasplits/datasets/dataset.py b/dacapo/experiments/datasplits/datasets/dataset.py
index ced4f58d..d3591b44 100644
--- a/dacapo/experiments/datasplits/datasets/dataset.py
+++ b/dacapo/experiments/datasplits/datasets/dataset.py
@@ -137,16 +137,22 @@ def _neuroglancer_layers(self, prefix="", exclude_layers=None):
             and self.raw._source_name() not in exclude_layers
         ):
             layers[self.raw._source_name()] = self.raw._neuroglancer_layer()
-        if (
-            self.gt is not None
-            and self.gt._can_neuroglance()
-            and self.gt._source_name() not in exclude_layers
-        ):
-            layers[self.gt._source_name()] = self.gt._neuroglancer_layer()
-        if (
-            self.mask is not None
-            and self.mask._can_neuroglance()
-            and self.mask._source_name() not in exclude_layers
-        ):
-            layers[self.mask._source_name()] = self.mask._neuroglancer_layer()
+        if self.gt is not None and self.gt._can_neuroglance():
+            new_layers = self.gt._neuroglancer_layer()
+            if isinstance(new_layers, list):
+                names = self.gt._source_name()
+                for name, layer in zip(names, new_layers):
+                    if name not in exclude_layers:
+                        layers[name] = layer
+            elif self.gt._source_name() not in exclude_layers:
+                layers[self.gt._source_name()] = new_layers
+        if self.mask is not None and self.mask._can_neuroglance():
+            new_layers = self.mask._neuroglancer_layer()
+            if isinstance(new_layers, list):
+                names = self.mask._source_name()
+                for name, layer in zip(names, new_layers):
+                    if name not in exclude_layers:
+                        layers[name] = layer
+            elif self.mask._source_name() not in exclude_layers:
+                layers["mask_" + self.mask._source_name()] = new_layers
         return layers
diff --git a/dacapo/experiments/datasplits/datasplit_generator.py b/dacapo/experiments/datasplits/datasplit_generator.py
index a69fd633..bb1c1947 100644
--- a/dacapo/experiments/datasplits/datasplit_generator.py
+++ b/dacapo/experiments/datasplits/datasplit_generator.py
@@ -1,12 +1,12 @@
 from dacapo.experiments.tasks import TaskConfig
 from upath import UPath as Path
-from typing import List
+from typing import List, Union, Optional, Sequence
 from enum import Enum, EnumMeta
 from funlib.geometry import Coordinate
-from typing import Union, Optional
 
 import zarr
 from zarr.n5 import N5FSStore
+import numpy as np
 from dacapo.experiments.datasplits.datasets.arrays import (
     ZarrArrayConfig,
     ZarrArray,
@@ -14,11 +14,15 @@
     BinarizeArrayConfig,
     IntensitiesArrayConfig,
     ConcatArrayConfig,
+    LogicalOrArrayConfig,
+    ConstantArrayConfig,
+    CropArrayConfig,
) from dacapo.experiments.datasplits import TrainValidateDataSplitConfig from dacapo.experiments.datasplits.datasets import RawGTDatasetConfig import logging + logger = logging.getLogger(__name__) @@ -76,6 +80,10 @@ def resize_if_needed( raw_upsample = raw_voxel_size / target_resolution raw_downsample = target_resolution / raw_voxel_size + assert len(target_resolution) == zarr_array.dims, ( + f"Target resolution {target_resolution} and raw voxel size {raw_voxel_size} " + f"have different dimensions {zarr_array.dims}" + ) if any([u > 1 or d > 1 for u, d in zip(raw_upsample, raw_downsample)]): return ResampledArrayConfig( name=f"{extra_str}_{array_config.name}_{array_config.dataset}_resampled", @@ -88,6 +96,39 @@ def resize_if_needed( return array_config +def limit_validation_crop_size(gt_config, mask_config, max_size): + gt_array = gt_config.array_type(gt_config) + voxel_shape = gt_array.roi.shape / gt_array.voxel_size + crop = False + while np.prod(voxel_shape) > max_size: + crop = True + max_idx = np.argmax(voxel_shape) + voxel_shape = Coordinate( + s if i != max_idx else s // 2 for i, s in enumerate(voxel_shape) + ) + if crop: + crop_roi_shape = voxel_shape * gt_array.voxel_size + context = (gt_array.roi.shape - crop_roi_shape) / 2 + crop_roi = gt_array.roi.grow(-context, -context) + crop_roi = crop_roi.snap_to_grid(gt_array.voxel_size, mode="shrink") + + logger.debug( + f"Cropped {gt_config.name}: original roi: {gt_array.roi}, new_roi: {crop_roi}" + ) + + gt_config = CropArrayConfig( + name=gt_config.name + "_cropped", + source_array_config=gt_config, + roi=crop_roi, + ) + mask_config = CropArrayConfig( + name=mask_config.name + "_cropped", + source_array_config=gt_config, + roi=crop_roi, + ) + return gt_config, mask_config + + def get_right_resolution_array_config( container: Path, dataset, target_resolution, extra_str="" ): @@ -389,12 +430,14 @@ def generate_dataspec_from_csv(csv_path: Path): class DataSplitGenerator: - """ - Generates DataSplitConfig for a given task config and datasets. A csv file can be generated - from the DataSplitConfig and used to generate the DataSplitConfig again. + """Generates DataSplitConfig for a given task config and datasets. + + Class names in gt_dataset should be within [] e.g. [mito&peroxisome&er] for + multiple classes or [mito] for one class. - Currently only supports semantic segmentation. - Supports: + Currently only supports: + - semantic segmentation. + Supports: - 2D and 3D datasets. - Zarr, N5 and OME-Zarr datasets. - Multi class targets. @@ -432,10 +475,14 @@ class DataSplitGenerator: The minimum raw value. raw_max : int The maximum raw value. - classes_separator_caracter : str + classes_separator_character : str The classes separator character. + max_validation_volume_size : int + The maximum validation volume size. Default is None. If None, the validation volume size is not limited. + else, the validation volume size is limited to the specified value. + e.g. 600**3 for 600^3 voxels = 216_000_000 voxels. 
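The new `max_validation_volume_size` option is enforced by `limit_validation_crop_size` (added earlier in this file's diff), which repeatedly halves the largest axis of the validation GT until the voxel count fits and then center-crops the GT and mask configs to the resulting ROI. A small walk-through of the shrinking loop with made-up numbers:

```python
# Hypothetical walk-through of the shrinking loop in limit_validation_crop_size,
# using a made-up validation volume of 1000 x 800 x 600 voxels and the
# max_validation_volume_size example of 600**3 (= 216_000_000) voxels.
import numpy as np

voxel_shape = [1000, 800, 600]  # ~480M voxels, too large
max_size = 600 ** 3             # 216_000_000

while np.prod(voxel_shape) > max_size:
    largest = int(np.argmax(voxel_shape))
    voxel_shape[largest] //= 2  # halve the largest axis, as the helper does

print(voxel_shape)  # [500, 400, 600] -> 120M voxels; the GT/mask ROIs are center-cropped to this
```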
Methods: - __init__(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_caracter) + __init__(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_character) Initializes the DataSplitGenerator class with the specified name, datasets, input resolution, output resolution, targets, segmentation type, maximum ground truth downsample, maximum ground truth upsample, maximum raw training downsample, maximum raw training upsample, maximum raw validation downsample, maximum raw validation upsample, minimum training volume size, minimum raw value, maximum raw value, and classes separator character. __str__(self) A method to get the string representation of the class. @@ -462,8 +509,8 @@ def __init__( self, name: str, datasets: List[DatasetSpec], - input_resolution: Coordinate, - output_resolution: Coordinate, + input_resolution: Union[Sequence[int], Coordinate], + output_resolution: Union[Sequence[int], Coordinate], targets: Optional[List[str]] = None, segmentation_type: Union[str, SegmentationType] = "semantic", max_gt_downsample=32, @@ -475,7 +522,10 @@ def __init__( min_training_volume_size=8_000, # 20**3 raw_min=0, raw_max=255, - classes_separator_caracter="&", + classes_separator_character="&", + use_negative_class=False, + max_validation_volume_size=None, + binarize_gt=False, ): """ Initializes the DataSplitGenerator class with the specified: @@ -495,6 +545,8 @@ def __init__( - minimum raw value - maximum raw value - classes separator character + - use negative class + - binarize ground truth Args: name : str @@ -527,29 +579,36 @@ def __init__( The minimum raw value. raw_max : int The maximum raw value. - classes_separator_caracter : str + classes_separator_character : str The classes separator character. + use_negative_class : bool + Whether to use negative classes. + binarize_gt : bool + Whether to binarize the ground truth as part of preprocessing. Use this if you are doing semantic segmentation on instance labels (where each object has a unique ID). Returns: obj : The DataSplitGenerator class. Raises: ValueError If the class name is already set, a ValueError is raised. 
Examples: - >>> DataSplitGenerator(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_caracter) + >>> DataSplitGenerator(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_character) Notes: This function is used to initialize the DataSplitGenerator class with the specified name, datasets, input resolution, output resolution, targets, segmentation type, maximum ground truth downsample, maximum ground truth upsample, maximum raw training downsample, maximum raw training upsample, maximum raw validation downsample, maximum raw validation upsample, minimum training volume size, minimum raw value, maximum raw value, and classes separator character. """ + if not isinstance(input_resolution, Coordinate): + input_resolution = Coordinate(input_resolution) + if not isinstance(output_resolution, Coordinate): + output_resolution = Coordinate(output_resolution) + if isinstance(segmentation_type, str): + segmentation_type = SegmentationType[segmentation_type.lower()] + self.name = name self.datasets = datasets self.input_resolution = input_resolution self.output_resolution = output_resolution self.targets = targets self._class_name = None - - if isinstance(segmentation_type, str): - segmentation_type = SegmentationType[segmentation_type.lower()] - self.segmentation_type = segmentation_type self.max_gt_downsample = max_gt_downsample self.max_gt_upsample = max_gt_upsample @@ -560,7 +619,15 @@ def __init__( self.min_training_volume_size = min_training_volume_size self.raw_min = raw_min self.raw_max = raw_max - self.classes_separator_caracter = classes_separator_caracter + self.classes_separator_character = classes_separator_character + self.use_negative_class = use_negative_class + self.max_validation_volume_size = max_validation_volume_size + self.binarize_gt = binarize_gt + if use_negative_class: + if targets is None: + raise ValueError( + "use_negative_class=True requires targets to be specified." + ) def __str__(self) -> str: """ @@ -599,6 +666,11 @@ def class_name(self): Notes: This function is used to get the class name. 
""" + if self._class_name is None: + if self.targets is None: + logger.warning("Both targets and class name are None.") + return None + self._class_name = self.targets return self._class_name # Goal is to force class_name to be set only once, so we have the same classes for all datasets @@ -649,7 +721,7 @@ def check_class_name(self, class_name): """ datasets, classes = format_class_name( - class_name, self.classes_separator_caracter + class_name, self.classes_separator_character, self.targets ) if self.class_name is None: self.class_name = classes @@ -708,27 +780,38 @@ def __generate_semantic_seg_datasplit(self): train_dataset_configs = [] validation_dataset_configs = [] for dataset in self.datasets: - raw_config, gt_config = self.__generate_semantic_seg_dataset_crop(dataset) + ( + raw_config, + gt_config, + mask_config, + ) = self.__generate_semantic_seg_dataset_crop(dataset) + if type(self.class_name) == list: + classes = self.classes_separator_character.join(self.class_name) + else: + classes = self.class_name if dataset.dataset_type == DatasetType.train: train_dataset_configs.append( RawGTDatasetConfig( - name=f"{dataset}_{self.class_name}_{self.output_resolution[0]}nm", + name=f"{dataset}_{gt_config.name}_{classes}_{self.output_resolution[0]}nm", raw_config=raw_config, gt_config=gt_config, + mask_config=mask_config, ) ) else: + if self.max_validation_volume_size is not None: + gt_config, mask_config = limit_validation_crop_size( + gt_config, mask_config, self.max_validation_volume_size + ) validation_dataset_configs.append( RawGTDatasetConfig( - name=f"{dataset}_{self.class_name}_{self.output_resolution[0]}nm", + name=f"{dataset}_{gt_config.name}_{classes}_{self.output_resolution[0]}nm", raw_config=raw_config, gt_config=gt_config, + mask_config=mask_config, ) ) - if type(self.class_name) == list: - classes = self.classes_separator_caracter.join(self.class_name) - else: - classes = self.class_name + return TrainValidateDataSplitConfig( name=f"{self.name}_{self.segmentation_type}_{classes}_{self.output_resolution[0]}nm", train_configs=train_dataset_configs, @@ -790,7 +873,10 @@ def __generate_semantic_seg_dataset_crop(self, dataset: DatasetSpec): max=self.raw_max, ) organelle_arrays = {} - classes_datasets, classes = self.check_class_name(gt_dataset) + # classes_datasets, classes = self.check_class_name(gt_dataset) + classes_datasets, classes = format_class_name( + gt_dataset, self.classes_separator_character, self.targets + ) for current_class_dataset, current_class_name in zip(classes_datasets, classes): if not (gt_path / current_class_dataset).exists(): raise FileNotFoundError( @@ -811,26 +897,87 @@ def __generate_semantic_seg_dataset_crop(self, dataset: DatasetSpec): self.output_resolution, "gt", ) - gt_config = BinarizeArrayConfig( - f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_binarized", - source_array_config=gt_config, - groupings=[(current_class_name, [])], - ) + if self.binarize_gt: + gt_config = BinarizeArrayConfig( + f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_binarized", + source_array_config=gt_config, + groupings=[(current_class_name, [])], + ) organelle_arrays[current_class_name] = gt_config + if self.targets is None: targets_str = "_".join(classes) current_targets = classes else: current_targets = self.targets targets_str = "_".join(self.targets) - if len(organelle_arrays) > 1: + + target_images = {} + target_masks = {} + + missing_classes = [c for c in current_targets if c not in classes] + found_classes = [c for c in 
current_targets if c in classes] + for t in found_classes: + target_images[t] = organelle_arrays[t] + + if len(missing_classes) > 0: + if not self.use_negative_class: + raise ValueError( + f"Missing classes found, {str(missing_classes)}, please specify use_negative_class=True to generate the missing classes." + ) + else: + if len(organelle_arrays) == 0: + raise ValueError( + f"No target classes found, please specify targets to generate the negative classes." + ) + # generate negative class + if len(organelle_arrays) > 1: + found_gt_config = ConcatArrayConfig( + name=f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_gt", + channels=list(organelle_arrays.keys()), + source_array_configs=organelle_arrays, + ) + missing_mask_config = LogicalOrArrayConfig( + name=f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_labelled_voxels", + source_array_config=found_gt_config, + ) + else: + missing_mask_config = list(organelle_arrays.values())[0] + missing_gt_config = ConstantArrayConfig( + name=f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_gt", + source_array_config=list(organelle_arrays.values())[0], + constant=0, + ) + for t in missing_classes: + target_images[t] = missing_gt_config + target_masks[t] = missing_mask_config + + for t in found_classes: + target_masks[t] = ConstantArrayConfig( + name=f"{dataset}_{t}_{self.output_resolution[0]}nm_labelled_voxels", + source_array_config=target_images[t], + constant=1, + ) + + if len(target_images) > 1: gt_config = ConcatArrayConfig( name=f"{dataset}_{targets_str}_{self.output_resolution[0]}nm_gt", channels=[organelle for organelle in current_targets], - source_array_configs={k: gt for k, gt in organelle_arrays.items()}, + # source_array_configs={k: gt for k, gt in target_images.items()}, + source_array_configs={k: target_images[k] for k in current_targets}, ) + mask_config = ConcatArrayConfig( + name=f"{dataset}_{targets_str}_{self.output_resolution[0]}nm_mask", + channels=[organelle for organelle in current_targets], + # source_array_configs={k: mask for k, mask in target_masks.items()}, + # to be sure to have the same order + source_array_configs={k: target_masks[k] for k in current_targets}, + ) + else: + gt_config = list(target_images.values())[0] + mask_config = list(target_masks.values())[0] - return raw_config, gt_config + return raw_config, gt_config, mask_config # @staticmethod # def generate_csv(datasets: List[DatasetSpec], csv_path: Path): @@ -844,8 +991,8 @@ def __generate_semantic_seg_dataset_crop(self, dataset: DatasetSpec): @staticmethod def generate_from_csv( csv_path: Path, - input_resolution: Coordinate, - output_resolution: Coordinate, + input_resolution: Union[Sequence[int], Coordinate], + output_resolution: Union[Sequence[int], Coordinate], name: Optional[str] = None, **kwargs, ): @@ -889,7 +1036,7 @@ def generate_from_csv( ) -def format_class_name(class_name, separator_character="&"): +def format_class_name(class_name, separator_character="&", targets=None): """ Format the class name. 
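As a quick illustration of the bracket convention that `format_class_name` parses; the module path and return values are inferred from the surrounding hunks, so treat this as a sketch rather than documented behavior:

```python
# assumed location: dacapo/experiments/datasplits/datasplit_generator.py
from dacapo.experiments.datasplits.datasplit_generator import format_class_name

# bracketed names expand into one sub-dataset per class:
format_class_name("labels/[mito&er]", "&")
# -> (["labels/mito", "labels/er"], ["mito", "er"])

# with the new fallback, a bare name is accepted when exactly one target is configured:
format_class_name("labels", "&", targets=["mito"])
# -> (["labels"], ["mito"])
```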
@@ -915,4 +1062,8 @@ def format_class_name(class_name, separator_character="&"): base_class_name = class_name.split("[")[0] return [f"{base_class_name}{c}" for c in classes], classes else: - raise ValueError(f"Invalid class name {class_name} missing '[' and ']'") + if targets is None: + raise ValueError(f"Invalid class name {class_name} missing '[' and ']'") + if len(targets) > 1: + raise ValueError(f"Invalid class name {class_name} missing '[' and ']'") + return [class_name], [targets[0]] diff --git a/dacapo/experiments/run.py b/dacapo/experiments/run.py index e7afb03a..8361e47b 100644 --- a/dacapo/experiments/run.py +++ b/dacapo/experiments/run.py @@ -1,11 +1,12 @@ from .datasplits import DataSplit from .tasks.task import Task from .architectures import Architecture -from .trainers import Trainer +from .trainers import Trainer, GunpowderTrainer from .training_stats import TrainingStats from .validation_scores import ValidationScores from .starts import Start from .model import Model +from typing import Optional import torch @@ -48,13 +49,13 @@ class Run: task: Task architecture: Architecture trainer: Trainer - datasplit: DataSplit + _datasplit: Optional[DataSplit] model: Model optimizer: torch.optim.Optimizer training_stats: TrainingStats - validation_scores: ValidationScores + _validation_scores: Optional[ValidationScores] def __init__(self, run_config, load_starter_model: bool = True): """ @@ -211,3 +212,30 @@ def move_optimizer( state[k] = v.to(device) if empty_cuda_cache: torch.cuda.empty_cache() + + def __str__(self): + return self.name + + def visualize_pipeline(self): + """ + Visualizes the pipeline for the run, including all produced arrays. + + Examples: + >>> run.visualize_pipeline() + + """ + if not isinstance(self.trainer, GunpowderTrainer): + raise NotImplementedError( + "Only GunpowderTrainer is supported for visualization" + ) + if not hasattr(self.trainer, "_pipeline"): + from ..store.create_store import create_array_store + + array_store = create_array_store() + self.trainer.build_batch_provider( + self.datasplit.train, + self.model, + self.task, + array_store.snapshot_container(self.name), + ) + self.trainer.visualize_pipeline() diff --git a/dacapo/experiments/tasks/affinities_task.py b/dacapo/experiments/tasks/affinities_task.py index a355288d..1a5b4d7c 100644 --- a/dacapo/experiments/tasks/affinities_task.py +++ b/dacapo/experiments/tasks/affinities_task.py @@ -25,14 +25,10 @@ class AffinitiesTask(Task): def __init__(self, task_config): """ - Create a `DummyTask` from a `DummyTaskConfig`. + Create an AffinitiesTask object from a given AffinitiesTaskConfig. Args: - task_config: The configuration for the task. - Returns: - A `DummyTask` object. - Raises: - NotImplementedError: This method is not implemented. 
+ task_config (AffinitiesTaskConfig): The configuration for the affinities task Examples: >>> task = AffinitiesTask(task_config) """ @@ -40,6 +36,8 @@ def __init__(self, task_config): self.predictor = AffinitiesPredictor( neighborhood=task_config.neighborhood, lsds=task_config.lsds, + num_voxels=task_config.num_lsd_voxels, + downsample_lsds=task_config.downsample_lsds, affs_weight_clipmin=task_config.affs_weight_clipmin, affs_weight_clipmax=task_config.affs_weight_clipmax, lsd_weight_clipmin=task_config.lsd_weight_clipmin, diff --git a/dacapo/experiments/tasks/affinities_task_config.py b/dacapo/experiments/tasks/affinities_task_config.py index 5e22f2a0..0aeb7276 100644 --- a/dacapo/experiments/tasks/affinities_task_config.py +++ b/dacapo/experiments/tasks/affinities_task_config.py @@ -17,6 +17,8 @@ class AffinitiesTaskConfig(TaskConfig): Attributes: neighborhood: A list of Coordinate objects. lsds: Whether or not to train lsds along with your affinities. + num_lsd_voxels: The number of voxels to use for the lsd center of mass calculation. + downsample_lsds: The factor by which to downsample the lsds. lsds_to_affs_weight_ratio: If training with lsds, set how much they should be weighted compared to affs. affs_weight_clipmin: The minimum value for affinities weights. affs_weight_clipmax: The maximum value for affinities weights. @@ -45,6 +47,19 @@ class AffinitiesTaskConfig(TaskConfig): "It has been shown that lsds as an auxiliary task can help affinity predictions." }, ) + num_lsd_voxels: int = attr.ib( + default=10, + metadata={ + "help_text": "The number of voxels to use for the lsd center of mass calculation." + }, + ) + downsample_lsds: int = attr.ib( + default=1, + metadata={ + "help_text": "The factor by which to downsample the lsds. " + "This can be useful to reduce the computational cost of training." 
+ }, + ) lsds_to_affs_weight_ratio: float = attr.ib( default=1, metadata={ diff --git a/dacapo/experiments/tasks/post_processors/threshold_post_processor.py b/dacapo/experiments/tasks/post_processors/threshold_post_processor.py index c0e10418..f99c64d3 100644 --- a/dacapo/experiments/tasks/post_processors/threshold_post_processor.py +++ b/dacapo/experiments/tasks/post_processors/threshold_post_processor.py @@ -68,7 +68,7 @@ def process( self, parameters: "ThresholdPostProcessorParameters", # type: ignore[override] output_array_identifier: "LocalArrayIdentifier", - num_workers: int = 16, + num_workers: int = 12, block_size: Coordinate = Coordinate((256, 256, 256)), ) -> ZarrArray: """ @@ -122,7 +122,7 @@ def process( read_roi = Roi((0, 0, 0), write_size[-self.prediction_array.dims :]) # run blockwise post-processing - run_blockwise( + sucess = run_blockwise( worker_file=str( Path(Path(dacapo.blockwise.__file__).parent, "threshold_worker.py") ), @@ -138,4 +138,7 @@ def process( threshold=parameters.threshold, ) + if not sucess: + raise RuntimeError("Blockwise post-processing failed.") + return output_array diff --git a/dacapo/experiments/tasks/predictors/affinities_predictor.py b/dacapo/experiments/tasks/predictors/affinities_predictor.py index 59f0cfa6..e4084270 100644 --- a/dacapo/experiments/tasks/predictors/affinities_predictor.py +++ b/dacapo/experiments/tasks/predictors/affinities_predictor.py @@ -4,22 +4,6 @@ from dacapo.experiments.datasplits.datasets.arrays import NumpyArray from dacapo.utils.affinities import seg_to_affgraph, padding as aff_padding from dacapo.utils.balance_weights import balance_weights - -from funlib.geometry import Coordinate -from lsd.train import LsdExtractor - -from scipy import ndimage -import numpy as np -import torch -import itertools - -from typing import List -from .predictor import Predictor -from dacapo.experiments import Model -from dacapo.experiments.arraytypes import EmbeddingArray -from dacapo.experiments.datasplits.datasets.arrays import NumpyArray -from dacapo.utils.affinities import seg_to_affgraph, padding as aff_padding -from dacapo.utils.balance_weights import balance_weights from funlib.geometry import Coordinate from lsd.train import LsdExtractor from scipy import ndimage diff --git a/dacapo/experiments/tasks/predictors/hot_distance_predictor.py b/dacapo/experiments/tasks/predictors/hot_distance_predictor.py index c25df23e..9b067f23 100644 --- a/dacapo/experiments/tasks/predictors/hot_distance_predictor.py +++ b/dacapo/experiments/tasks/predictors/hot_distance_predictor.py @@ -188,7 +188,7 @@ def create_weight(self, gt, target, mask, moving_class_counts=None): self.dt_scale_factor, ) else: - distance_mask = np.ones_like(target.data) + distance_mask = np.ones_like(gt.data) distance_weights, distance_moving_class_counts = balance_weights( gt[target.roi], diff --git a/dacapo/experiments/trainers/gunpowder_trainer.py b/dacapo/experiments/trainers/gunpowder_trainer.py index 4916e557..6b9ecf20 100644 --- a/dacapo/experiments/trainers/gunpowder_trainer.py +++ b/dacapo/experiments/trainers/gunpowder_trainer.py @@ -489,3 +489,90 @@ def can_train(self, datasets) -> bool: """ return all([dataset.gt is not None for dataset in datasets]) + + def visualize_pipeline(self): + if self._pipeline is None: + raise ValueError("Pipeline not initialized!") + + import neuroglancer + + # self.iteration = 0 + + pipeline = self._pipeline.children[0].children[0].copy() + if self.num_data_fetchers > 1: + pipeline = pipeline.children[0] + + pipeline += gp.Stack(1) + + 
request = self._request + # raise Exception(request) + + def batch_generator(): + with gp.build(pipeline): + while True: + yield pipeline.request_batch(request) + + batch_gen = batch_generator() + + def load_batch(event): + print("fetching_batch") + batch = next(batch_gen) + + with viewer.txn() as s: + while len(s.layers) > 0: + del s.layers[0] + + # reverse order for raw so we can set opacity to 1, this + # way higher res raw replaces low res when available + for name, array in batch.arrays.items(): + print(name) + data = array.data[0] + + channel_dims = len(data.shape) - len(array.spec.voxel_size) + assert channel_dims <= 1 + + dims = neuroglancer.CoordinateSpace( + names=["c^", "z", "y", "x"][-len(data.shape) :], + units="nm", + scales=tuple([1] * channel_dims) + tuple(array.spec.voxel_size), + ) + + local_vol = neuroglancer.LocalVolume( + data=data, + voxel_offset=tuple([0] * channel_dims) + + tuple((-array.spec.roi.shape / 2) / array.spec.voxel_size), + dimensions=dims, + ) + + if name == self._gt_key: + s.layers[str(name)] = neuroglancer.SegmentationLayer( + source=local_vol + ) + else: + s.layers[str(name)] = neuroglancer.ImageLayer(source=local_vol) + + s.layout = neuroglancer.row_layout( + [ + neuroglancer.column_layout( + [ + neuroglancer.LayerGroupViewer( + layers=[str(k) for k, v in batch.items()] + ), + ] + ) + ] + ) + + neuroglancer.set_server_bind_address("0.0.0.0") + + viewer = neuroglancer.Viewer() + + viewer.actions.add("load_batch", load_batch) + + with viewer.config_state.txn() as s: + s.input_event_bindings.data_view["keyt"] = "load_batch" + + print(viewer) + load_batch(None) + + input("Enter to quit!") diff --git a/dacapo/experiments/training_stats.py b/dacapo/experiments/training_stats.py index eef5f2c9..1b84a742 100644 --- a/dacapo/experiments/training_stats.py +++ b/dacapo/experiments/training_stats.py @@ -5,6 +5,9 @@ from typing import List import attr +import logging + +logger = logging.getLogger(__name__) @attr.s @@ -62,9 +65,11 @@ def add_iteration_stats(self, iteration_stats: TrainingIterationStats) -> None: - The inner list contains the stats for each training iteration. """ if len(self.iteration_stats) > 0: - assert ( - iteration_stats.iteration == self.iteration_stats[-1].iteration + 1 - ), f"Expected iteration {self.iteration_stats[-1].iteration + 1}, got {iteration_stats.iteration}" + if iteration_stats.iteration <= self.iteration_stats[-1].iteration: + logger.error( + f"Expected iteration {self.iteration_stats[-1].iteration + 1}, got {iteration_stats.iteration}. 
will remove stats after {iteration_stats.iteration-1}" + ) + self.delete_after(iteration_stats.iteration - 1) self.iteration_stats.append(iteration_stats) diff --git a/dacapo/options.py b/dacapo/options.py index 589d3d7c..e52a0c57 100644 --- a/dacapo/options.py +++ b/dacapo/options.py @@ -73,7 +73,8 @@ def serialize(self): {'type': 'files', 'runs_base_dir': '/home/user/dacapo', 'compute_context': {'type': 'LocalTorch', 'config': {}}, 'mongo_db_host': None, 'mongo_db_name': None} """ converter = Converter() - return converter.unstructure(self) + data = converter.unstructure(self) + return {k: v for k, v in data.items() if v is not None} class Options: @@ -146,6 +147,7 @@ def config_file(cls) -> Optional[Path]: ] for path in options_files: if path.exists(): + os.environ["OPTIONS_FILE"] = str(path) return path return None diff --git a/dacapo/plot.py b/dacapo/plot.py index e86f697b..d5bfe1d2 100644 --- a/dacapo/plot.py +++ b/dacapo/plot.py @@ -7,10 +7,16 @@ import bokeh.layouts import bokeh.plotting import numpy as np +from tqdm import tqdm from collections import namedtuple import itertools from typing import List +import matplotlib.pyplot as plt + + +import os + RunInfo = namedtuple( "RunInfo", @@ -104,7 +110,7 @@ def get_runs_info( run_config.trainer_config.name, run_config.datasplit_config.name, ( - stats_store.retrieve_training_stats(run_config_name, subsample=True) + stats_store.retrieve_training_stats(run_config_name) if plot_loss else None ), @@ -117,7 +123,7 @@ def get_runs_info( return runs -def plot_runs( +def bokeh_plot_runs( run_config_base_names, smooth=100, validation_scores=None, @@ -159,7 +165,7 @@ def plot_runs( tools="pan, wheel_zoom, reset, save, hover", x_axis_label="iterations", tooltips=loss_tooltips, - plot_width=2048, + # plot_width=2048, ) loss_figure.background_fill_color = "#efefef" @@ -202,7 +208,7 @@ def plot_runs( tools="pan, wheel_zoom, reset, save, hover", x_axis_label="iterations", tooltips=validation_tooltips, - plot_width=2048, + # plot_width=2048, ) validation_figure.background_fill_color = "#efefef" validation_figures[dataset.name] = validation_figure @@ -226,7 +232,7 @@ def plot_runs( x_axis_label="model size", y_axis_label="best validation", tooltips=summary_tooltips, - plot_width=2048, + # plot_width=2048, ) summary_figure.background_fill_color = "#efefef" @@ -297,24 +303,24 @@ def plot_runs( "run": [run.name] * len(x), } # TODO: get_best: higher_is_better is not true for all scores - best_parameters, best_scores = run.validation_scores.get_best( - dataset_data, dim="parameters" - ) - - source_dict.update( - { - name: np.array( - [ - getattr(best_parameter, name) - for best_parameter in best_parameters.values - ] - ) - for name in run.validation_scores.parameter_names - } - ) - source_dict.update( - {run.validation_score_name: np.array(best_scores.values)} - ) + # best_parameters, best_scores = run.validation_scores.get_best( + # dataset_data, dim="parameters" + # ) + + # source_dict.update( + # { + # name: np.array( + # [ + # getattr(best_parameter, name) + # for best_parameter in best_parameters.values + # ] + # ) + # for name in run.validation_scores.parameter_names + # } + # ) + # source_dict.update( + # {run.validation_score_name: np.array(best_scores.values)} + # ) source = bokeh.plotting.ColumnDataSource(source_dict) validation_figures[dataset.name].line( @@ -384,3 +390,92 @@ def plot_runs( else: bokeh.plotting.output_file("performance_plots.html") bokeh.plotting.save(plot) + + +def plot_runs( + run_config_base_names, + smooth=100, + 
validation_scores=None, + higher_is_betters=None, + plot_losses=None, +): + """ + Plot runs. + Args: + run_config_base_names: Names of run configs to plot + smooth: Smoothing factor + validation_scores: Validation scores to plot + higher_is_betters: Whether higher is better + plot_losses: Whether to plot losses + Returns: + None + """ + print("PLOTTING RUNS") + runs = get_runs_info(run_config_base_names, validation_scores, plot_losses) + print("GOT RUNS INFO") + + colors = itertools.cycle(plt.cm.tab20.colors) + include_validation_figure = False + include_loss_figure = False + + fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 10)) + loss_ax = axes[0] + validation_ax = axes[1] + + for run, color in zip(runs, colors): + name = run.name + + if run.plot_loss: + iterations = [stat.iteration for stat in run.training_stats.iteration_stats] + losses = [stat.loss for stat in run.training_stats.iteration_stats] + + print(f"Run {run.name} has {len(losses)} iterations") + + if run.plot_loss: + include_loss_figure = True + smooth = int(np.maximum(len(iterations) / 2500, 1)) + print(f"smoothing: {smooth}") + x, _ = smooth_values(iterations, smooth, stride=smooth) + y, s = smooth_values(losses, smooth, stride=smooth) + print(x, y) + print(f"plotting {(len(x), len(y))} points") + loss_ax.plot(x, y, label=name, color=color) + print("LOSS PLOTTED") + + if run.validation_score_name and run.validation_scores.validated_until() > 0: + validation_score_data = run.validation_scores.to_xarray().sel( + criteria=run.validation_score_name + ) + colors_val = itertools.cycle(plt.cm.tab20.colors) + for dataset, color_v in zip(run.validation_scores.datasets, colors_val): + dataset_data = validation_score_data.sel(datasets=dataset) + include_validation_figure = True + x = [score.iteration for score in run.validation_scores.scores] + cc = next(colors_val) + for i in range(dataset_data.data.shape[1]): + current_name = ( + f"{i}_{dataset.name}_{name}_{run.validation_score_name}" + ) + validation_ax.plot( + x, + dataset_data.data[:, i], + label=current_name, + color=cc, + alpha=0.5 + 0.2 * i, + ) + print("VALIDATION PLOTTED") + + if include_loss_figure: + loss_ax.set_title("Training") + loss_ax.set_xlabel("Iterations") + loss_ax.set_ylabel("Loss") + loss_ax.legend() + + if include_validation_figure: + validation_ax.set_title("Validation") + validation_ax.set_xlabel("Iterations") + validation_ax.set_ylabel("Validation Score") + validation_ax.legend() + + plt.tight_layout() + plt.show() diff --git a/dacapo/predict.py b/dacapo/predict.py index 0c09a9f7..f28e9766 100644 --- a/dacapo/predict.py +++ b/dacapo/predict.py @@ -1,5 +1,5 @@ from upath import UPath as Path - +from dacapo.blockwise import global_vars from dacapo.blockwise import run_blockwise import dacapo.blockwise from dacapo.experiments import Run @@ -24,7 +24,7 @@ def predict( input_dataset: str, output_path: LocalArrayIdentifier | Path | str, output_roi: Optional[Roi | str] = None, - num_workers: int = 12, + num_workers: int = 1, output_dtype: np.dtype | str = np.uint8, # type: ignore overwrite: bool = True, ): @@ -72,14 +72,13 @@ def predict( output_container, f"prediction_{run_name}_{iteration}" ) - # get the model's input and output size compute_context = create_compute_context() if isinstance(compute_context, LocalTorch): num_workers = 1 model = run.model.eval() - if iteration is not None: + if iteration is not None and not compute_context.distribute_workers: # create weights store weights_store = create_weights_store() @@ -94,7 +93,7 @@ def predict( 
input_size = input_voxel_size * input_shape output_size = output_voxel_size * model.compute_output_shape(input_shape)[1] num_out_channels = model.num_out_channels - del model + # del model # calculate input and output rois @@ -137,10 +136,12 @@ def predict( write_size=output_size, ) + global_vars.current_run = run + # run blockwise prediction worker_file = str(Path(Path(dacapo.blockwise.__file__).parent, "predict_worker.py")) print("Running blockwise prediction with worker_file: ", worker_file) - run_blockwise( + success = run_blockwise( worker_file=worker_file, total_roi=_input_roi, read_roi=Roi((0, 0, 0), input_size), @@ -149,9 +150,10 @@ def predict( max_retries=2, # TODO: make this an option timeout=None, # TODO: make this an option ###### - run_name=run_name, + run_name=run.name, iteration=iteration, input_array_identifier=input_array_identifier, output_array_identifier=output_array_identifier, ) print("Done predicting.") + return success diff --git a/dacapo/store/converter.py b/dacapo/store/converter.py index 62bb2f4d..7e5451b3 100644 --- a/dacapo/store/converter.py +++ b/dacapo/store/converter.py @@ -121,10 +121,11 @@ class from unstructured data. cls = cls_fn(obj_data["__type__"]) structure_fn = make_dict_structure_fn(cls, self) return structure_fn(obj_data, cls) - except: + except Exception as e: print( f"Could not structure object of type {obj_data}. will try unstructured data. attr __type__ can be missing because of old version of the data." ) + print(e) return obj_data diff --git a/dacapo/store/file_stats_store.py b/dacapo/store/file_stats_store.py index 72cf9df5..2efdd51a 100644 --- a/dacapo/store/file_stats_store.py +++ b/dacapo/store/file_stats_store.py @@ -87,6 +87,7 @@ def store_training_stats(self, run_name, stats): ) else: # current stats are behind DB--drop DB + existing_stats = None logger.warning( f"Overwriting previous training stats for run {run_name}" ) @@ -94,7 +95,7 @@ def store_training_stats(self, run_name, stats): # store all new stats self.__store_training_stats( - stats, store_from_iteration, stats.trained_until(), run_name + existing_stats, stats, store_from_iteration, stats.trained_until(), run_name ) def retrieve_training_stats(self, run_name): @@ -174,11 +175,12 @@ def delete_training_stats(self, run_name: str) -> None: """ self.__delete_training_stats(run_name) - def __store_training_stats(self, stats, begin, end, run_name): + def __store_training_stats(self, existing_stats, stats, begin, end, run_name): """ Store the training statistics for a specific run. Args: + existing_stats (Stats): The statistics object containing the training stats that are already stored. stats (Stats): The statistics object containing the training stats. begin (int): The starting index of the iteration stats to store. end (int): The ending index of the iteration stats to store. 
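Stepping back to the `predict()` changes above: the function now returns the blockwise success flag instead of assuming completion, so callers can fail fast. A hedged usage sketch, with hypothetical argument values:

```python
from dacapo import predict

success = predict(
    run,                             # an in-memory Run, as in validate()
    300000,                          # iteration whose weights should be used
    input_container="raw.zarr",
    input_dataset="raw",
    output_path="predictions.zarr",
    overwrite=True,
)
if not success:
    raise RuntimeError("blockwise prediction failed")
```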
@@ -190,10 +192,15 @@ def __store_training_stats(self, stats, begin, end, run_name): """ docs = converter.unstructure(stats.iteration_stats[begin:end]) - for doc in docs: - doc.update({"run_name": run_name}) if docs: + if existing_stats: + # prepend existing stats to new stats + docs = converter.unstructure(existing_stats.iteration_stats) + docs + + for doc in docs: + doc.update({"run_name": run_name}) + file_store = self.training_stats / run_name with file_store.open("wb") as fd: pickle.dump(docs, fd) diff --git a/dacapo/store/local_weights_store.py b/dacapo/store/local_weights_store.py index fb375602..94190628 100644 --- a/dacapo/store/local_weights_store.py +++ b/dacapo/store/local_weights_store.py @@ -142,7 +142,9 @@ def retrieve_weights(self, run: str, iteration: int) -> Weights: weights_name = self.__get_weights_dir(run) / "iterations" / str(iteration) - weights: Weights = torch.load(weights_name, map_location="cpu") + weights: Weights = torch.load( + weights_name, map_location="cpu", weights_only=False + ) if not isinstance(weights, Weights): # backwards compatibility weights = Weights(weights["model"], weights["optimizer"]) diff --git a/dacapo/train.py b/dacapo/train.py index f218b325..a1d884b3 100644 --- a/dacapo/train.py +++ b/dacapo/train.py @@ -47,7 +47,7 @@ def train(run_name: str): return train_run(run) -def train_run(run: Run): +def train_run(run: Run, do_validate=True): """ Train a run @@ -57,7 +57,7 @@ def train_run(run: Run): ValueError: If run_name is not found in config store """ - print(f"Starting/resuming training for run {run}...") + print(f"Starting/resuming training for run {run.name}...") # create run @@ -184,28 +184,31 @@ def train_run(run: Run): stats_store.store_training_stats(run.name, run.training_stats) weights_store.store_weights(run, iteration_stats.iteration + 1) - try: - # launch validation in a separate thread to avoid blocking training - validate_thread = threading.Thread( - target=validate, - args=(run, iteration_stats.iteration + 1), - name=f"validate_{run.name}_{iteration_stats.iteration + 1}", - daemon=True, - ) - validate_thread.start() - # validate( - # run, - # iteration_stats.iteration + 1, - # ) - - stats_store.store_validation_iteration_scores( - run.name, run.validation_scores - ) - except Exception as e: - logger.error( - f"Validation failed for run {run.name} at iteration " - f"{iteration_stats.iteration + 1}.", - exc_info=e, - ) + if do_validate: + try: + # launch validation in a separate thread to avoid blocking training + if compute_context.distribute_workers: + validate_thread = threading.Thread( + target=validate, + args=(run, iteration_stats.iteration + 1), + name=f"validate_{run.name}_{iteration_stats.iteration + 1}", + daemon=True, + ) + validate_thread.start() + else: + validate( + run, + iteration_stats.iteration + 1, + ) + + stats_store.store_validation_iteration_scores( + run.name, run.validation_scores + ) + except Exception as e: + logger.error( + f"Validation failed for run {run.name} at iteration " + f"{iteration_stats.iteration + 1}.", + exc_info=e, + ) print(f"Trained until {trained_until}. 
Finished.") diff --git a/dacapo/validate.py b/dacapo/validate.py index b49ffe4c..44f091e7 100644 --- a/dacapo/validate.py +++ b/dacapo/validate.py @@ -1,3 +1,4 @@ +from dacapo.compute_context import create_compute_context from .predict import predict from .experiments import Run, ValidationIterationScores from .experiments.datasplits.datasets.arrays import ZarrArray @@ -16,21 +17,41 @@ def validate_run( - run: Run, + run_name: str, iteration: int, num_workers: int = 1, output_dtype: str = "uint8", overwrite: bool = True, ): """ - validate_run is deprecated and will be removed in a future version. Please use validate instead. + Validate a run at a given iteration. Loads the weights from a previously + stored checkpoint. Returns the best parameters and scores for this + iteration. + + Args: + run: The name of the run to validate. + iteration: The iteration to validate. + num_workers: The number of workers to use for validation. + output_dtype: The dtype to use for the output arrays. + overwrite: Whether to overwrite existing output arrays + """ - warn( - "validate_run is deprecated and will be removed in a future version. Please use validate instead.", - DeprecationWarning, - ) + # Load the model and weights + config_store = create_config_store() + run_config = config_store.retrieve_run_config(run_name) + run = Run(run_config) + compute_context = create_compute_context() + if iteration is not None and not compute_context.distribute_workers: + # create weights store + weights_store = create_weights_store() + + # load weights + run.model.load_state_dict( + weights_store.retrieve_weights(run_name, iteration).model + ) + return validate( - run_name=run, + run=run, iteration=iteration, num_workers=num_workers, output_dtype=output_dtype, @@ -39,7 +60,7 @@ def validate_run( def validate( - run_name: str | Run, + run: Run, iteration: int, num_workers: int = 1, output_dtype: str = "uint8", @@ -51,7 +72,7 @@ def validate( iteration. Args: - run_name: The name of the run to validate. + run: The run to validate. iteration: The iteration to validate. num_workers: The number of workers to use for validation. output_dtype: The dtype to use for the output arrays. @@ -61,18 +82,12 @@ def validate( Raises: ValueError: If the run does not have a validation dataset or the dataset does not have ground truth. Example: - validate("my_run", 1000) + validate(my_run, 1000) """ - print(f"Validating run {run_name} at iteration {iteration}...") + print(f"Validating run {run.name} at iteration {iteration}...") - if isinstance(run_name, Run): - run = run_name - run_name = run.name - else: - config_store = create_config_store() - run_config = config_store.retrieve_run_config(run_name) - run = Run(run_config) + run_name = run.name # read in previous training/validation stats stats_store = create_stats_store() @@ -170,9 +185,10 @@ def validate( prediction_array_identifier = array_store.validation_prediction_array( run.name, iteration, validation_dataset.name ) - predict( + compute_context = create_compute_context() + success = predict( run, - iteration, + iteration if compute_context.distribute_workers else None, input_container=input_raw_array_identifier.container, input_dataset=input_raw_array_identifier.dataset, output_path=prediction_array_identifier, @@ -182,6 +198,12 @@ def validate( overwrite=overwrite, ) + if not success: + logger.error( + f"Could not predict run {run.name} on dataset {validation_dataset.name}." 
+ ) + continue + print(f"Predicted on dataset {validation_dataset.name}") post_processor.set_prediction(prediction_array_identifier) diff --git a/docs/source/aws.rst b/docs/source/aws.rst new file mode 100644 index 00000000..518f3bb6 --- /dev/null +++ b/docs/source/aws.rst @@ -0,0 +1,76 @@ +.. automodule:: dacapo + +.. contents:: + :depth: 1 + :local: + +AWS EC2 Setup Guide +=================== + +This guide will help you to run your Docker image on an AWS EC2 instance and set up S3 access for storing data. + +Running Docker Image on AWS EC2 +------------------------------- + +To run your Docker image on an AWS EC2 instance, follow these steps: + +1. **Create Key Pair** (if you don't have one already): + +.. code-block:: bash + + aws ec2 create-key-pair --key-name MyKeyPair --query 'KeyMaterial' --output text > MyKeyPair.pem + chmod 400 MyKeyPair.pem + +2. **Create a Security Group** (if you don't have one already): + +.. code-block:: bash + + aws ec2 create-security-group --group-name my-security-group --description "My security group" + + +3. **Authorize Inbound Traffic for the Security Group**: + +.. code-block:: bash + aws ec2 authorize-security-group-ingress --group-name my-security-group --protocol tcp --port 22 --cidr 0.0.0.0/0 + aws ec2 authorize-security-group-ingress --group-name my-security-group --protocol tcp --port 80 --cidr 0.0.0.0/0 + + +4. **Run EC2 Instance with Docker Image**: + + Use the following script to launch an EC2 instance, pull your Docker image from DockerHub, and run it with port forwarding: + +.. code-block:: bash + + aws ec2 run-instances --image-id ami-0b8956f13d7bdfe7b --count 1 --instance-type p3.2xlarge --key-name --security-groups --user-data '#!/bin/bash + yum update -y + amazon-linux-extras install docker -y + service docker start + docker pull mzouink/dacapo:0.3.0 + docker run -d -p 80:8000 mzouink/dacapo:0.3.0' + + +Replace `` with the name of your key pair and `` with the name of your security group. + +S3 Access Configuration +----------------------- + +You can work locally using S3 data by setting the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables. You can also set the `AWS_REGION` environment variable to specify the region to use. If you are using a profile, you can set the `AWS_PROFILE` environment variable to specify the profile to use. + +To configure AWS CLI, use the following command: + +.. code-block:: bash + + aws configure + + +Storing Checkpoints and Experiments Data in S3 +---------------------------------------------- + +To store checkpoints and experiments data in S3, modify the `dacapo.yaml` file to include the following: + +.. code-block:: yaml + + runs_base_dir: "s3://dacapotest" + + +This setup will help you run your Docker image on AWS EC2 and configure S3 access for storing checkpoints and experiment data. diff --git a/docs/source/index.rst b/docs/source/index.rst index a4390edd..c35cb2df 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,6 +11,7 @@ install tutorial docker + aws autoapi/index cli diff --git a/docs/source/install.rst b/docs/source/install.rst index b90937e2..988cbce1 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -6,6 +6,11 @@ Installation .. automodule:: gunpowder :noindex: +For the most recent release: +.. code-block:: bash + + pip install dacapo-ml + For the latest version: .. 
code-block:: bash
diff --git a/examples/blockwise/segment_config.yaml b/examples/blockwise/segment_config.yaml
new file mode 100644
index 00000000..80e0ff12
--- /dev/null
+++ b/examples/blockwise/segment_config.yaml
@@ -0,0 +1,7 @@
+
+steps:
+  gaussian_smooth:
+    sigma: 2.0
+  threshold:
+    threshold: 200
+  label: {}
diff --git a/examples/blockwise/segment_function.py b/examples/blockwise/segment_function.py
new file mode 100644
index 00000000..c49239c4
--- /dev/null
+++ b/examples/blockwise/segment_function.py
@@ -0,0 +1,39 @@
+import logging
+
+import scipy.ndimage
+import yaml
+
+logger = logging.getLogger(__file__)
+
+
+def segment_function(input_array, block, config_path):
+    """
+    Segment a 3D block using a small scipy-based post-processing script.
+
+    Args:
+        input_array (np.ndarray): The 3D array to segment.
+        block (daisy.Block): The block object.
+        config_path (str): The path to the configuration yaml file.
+    Returns:
+        np.ndarray: The segmented 3D array.
+    """
+    data = input_array.to_ndarray(block.read_roi)
+    with open(config_path) as f:
+        steps = yaml.load(f, Loader=yaml.FullLoader)
+
+    # Apply the segmentation steps in the order given by the config
+    for step, params in steps.items():
+        if step == "gaussian_smooth":
+            sigma = params.get("sigma", 1.0)
+            logger.info(f"Applying Gaussian smoothing with sigma={sigma}")
+            data = scipy.ndimage.gaussian_filter(data, sigma=sigma)
+        elif step == "threshold":
+            threshold = params.get("threshold", 0.5)
+            logger.info(f"Applying thresholding with threshold={threshold}")
+            data = data > threshold
+        elif step == "label":
+            structuring_element = params.get("structuring_element", None)
+            logger.info("Applying labeling")
+            data, _ = scipy.ndimage.label(data, structuring_element)  # type: ignore
+
+    return data
diff --git a/examples/blockwise/segment_script.sh b/examples/blockwise/segment_script.sh
new file mode 100644
index 00000000..ec4f191f
--- /dev/null
+++ b/examples/blockwise/segment_script.sh
@@ -0,0 +1,25 @@
+# This file contains the commands used to run an example of the segment-blockwise command.
+# The arguments to the segment-blockwise command are:
+# -sf: path to the python file with the segmentation_function (in this case the segment_function.py file)
+# -tr: Total ROI to process. It is a list of 3 elements, each one is a list with the start and end of the ROI in the x, y and z axis respectively.
+#      e.g. -tr "[320000:330000, 100000:110000, 10000:20000]" \
+# -rr: ROI to read. It is a list of 3 elements, each one is a list with the start and end of the ROI in the x, y and z axis respectively.
+# -wr: ROI to write. It is a list of 3 elements, each one is a list with the start and end of the ROI in the x, y and z axis respectively.
+# -nw: Number of workers to use.
+# -ic: Input container. It is the path to the input zarr file.
+# -id: Input dataset. It is the path to the input dataset inside the input zarr file.
+# -oc: Output container. It is the path to the output zarr file.
+# -od: Output dataset. It is the path to the output dataset inside the output zarr file.
+# --config_path: Path to the config yaml file.
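For reference, the example `segment_config.yaml` above parses into a plain dict of steps that `segment_function` walks in order. An illustrative sketch, not part of the repository:

```python
import yaml

with open("segment_config.yaml") as f:
    steps = yaml.safe_load(f)

# steps == {"gaussian_smooth": {"sigma": 2.0}, "threshold": {"threshold": 200}, "label": {}}
# applied in order: gaussian smoothing -> thresholding -> connected-component labelling
```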
+ + +dacapo segment-blockwise \ +-sf segment_function.py \ +-rr "[256,256,256]" \ +-wr "[256,256,256]" \ +-nw 32 \ +-ic predictions/c-elegen/bw/c_elegen_bw_op50_ld_scratch_0_300000.zarr \ +-id ld/ld \ +-oc predictions/c-elegen/bw/jrc_c-elegans-bw-1_postprocessed.zarr \ +-od ld \ +--config_path segment_config.yaml diff --git a/examples/distance_task/synthetic_example.py b/examples/distance_task/synthetic_example.py index 210109ee..262573ff 100644 --- a/examples/distance_task/synthetic_example.py +++ b/examples/distance_task/synthetic_example.py @@ -157,12 +157,23 @@ from dacapo.experiments.datasplits import DataSplitGenerator from funlib.geometry import Coordinate +csv_path = Path(runs_base_dir, "synthetic_example.csv") +if not csv_path.exists(): + # Create a csv file with the paths to the zarr files + with open(csv_path, "w") as f: + f.write( + f"train,{train_data_path},raw,{train_data_path},[labels]\n" + f"val,{validate_data_path},raw,{validate_data_path},[labels]\n" + # f"test,{test_data_path},raw,{test_data_path},[labels]\n" + ) + input_resolution = Coordinate(8, 8, 8) output_resolution = Coordinate(8, 8, 8) datasplit_config = DataSplitGenerator.generate_from_csv( - "/misc/public/dacapo_learnathon/datasplit_csvs/synthetic_example.csv", + csv_path, input_resolution, output_resolution, + binarize_gt=True, # Binarize the ground truth data to convert from instance segmentation to semantic segmentation ).compute() datasplit = datasplit_config.datasplit_type(datasplit_config) @@ -390,6 +401,20 @@ config_store = create_config_store() run = Run(config_store.retrieve_run_config(run_config.name)) +# First visualize all the steps in the data preprocessing pipeline +from dacapo.store.create_store import create_array_store + +array_store = create_array_store() +run.trainer.build_batch_provider( + run.datasplit.train, + run.model, + run.task, + array_store.snapshot_container(run.name), +) +run.trainer.visualize_pipeline() + +# %% Now let's train! 
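An aside, not part of the example script: if you prefer to skip the in-loop validation thread, the new `do_validate` flag on `train_run` shown earlier in `dacapo/train.py` can be used instead of the plain `train` call.

```python
from dacapo.train import train_run

# train without launching validation; validate selected iterations later
train_run(run, do_validate=False)
```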
+ # Visualize as we go run_viewer = NeuroglancerRunViewer(run) run_viewer.start() diff --git a/pyproject.toml b/pyproject.toml index 0ab64cdf..ade6bb47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "lazy-property", "neuroglancer", "torch", - "fibsem_tools", + "fibsem_tools<=6.3", "attrs", "bokeh", "numpy-indexed>=0.3.7", @@ -47,7 +47,7 @@ dependencies = [ "funlib.geometry>=0.2", "mwatershed>=0.1", "cellmap-models", - "funlib.persistence>=0.3.0", + "funlib.persistence==0.4.0", "gunpowder>=1.3", "lsds", "xarray", @@ -63,7 +63,7 @@ dependencies = [ # extras # https://peps.python.org/pep-0621/#dependencies-optional-dependencies [project.optional-dependencies] -test = ["pytest==7.4.4", "pytest-cov", "pytest-lazy-fixture"] +test = ["pytest", "pytest-cov", "pytest-lazy-fixtures"] dev = [ "black", "mypy", diff --git a/tests/components/test_arrays.py b/tests/components/test_arrays.py index d62dcb97..d91863ad 100644 --- a/tests/components/test_arrays.py +++ b/tests/components/test_arrays.py @@ -3,15 +3,15 @@ from dacapo.store.create_store import create_config_store import pytest -from pytest_lazyfixture import lazy_fixture +from pytest_lazy_fixtures import lf @pytest.mark.parametrize( "array_config", [ - lazy_fixture("cellmap_array"), - lazy_fixture("zarr_array"), - lazy_fixture("dummy_array"), + lf("cellmap_array"), + lf("zarr_array"), + lf("dummy_array"), ], ) def test_array_api(options, array_config): diff --git a/tests/components/test_gp_arraysource.py b/tests/components/test_gp_arraysource.py index 7ff62603..69fee515 100644 --- a/tests/components/test_gp_arraysource.py +++ b/tests/components/test_gp_arraysource.py @@ -5,15 +5,15 @@ import gunpowder as gp import pytest -from pytest_lazyfixture import lazy_fixture +from pytest_lazy_fixtures import lf @pytest.mark.parametrize( "array_config", [ - lazy_fixture("cellmap_array"), - lazy_fixture("zarr_array"), - lazy_fixture("dummy_array"), + lf("cellmap_array"), + lf("zarr_array"), + lf("dummy_array"), ], ) def test_gp_dacapo_array_source(array_config): diff --git a/tests/components/test_trainers.py b/tests/components/test_trainers.py index 172a89b7..f3f9b07a 100644 --- a/tests/components/test_trainers.py +++ b/tests/components/test_trainers.py @@ -3,14 +3,14 @@ from dacapo.store.create_store import create_config_store import pytest -from pytest_lazyfixture import lazy_fixture +from pytest_lazy_fixtures import lf @pytest.mark.parametrize( "trainer_config", [ - lazy_fixture("dummy_trainer"), - lazy_fixture("gunpowder_trainer"), + lf("dummy_trainer"), + lf("gunpowder_trainer"), ], ) def test_trainer( diff --git a/tests/operations/test_apply.py b/tests/operations/test_apply.py index 5ce608e1..02bbd47b 100644 --- a/tests/operations/test_apply.py +++ b/tests/operations/test_apply.py @@ -8,7 +8,7 @@ from dacapo import apply import pytest -from pytest_lazyfixture import lazy_fixture +from pytest_lazy_fixtures import lf import logging @@ -18,9 +18,9 @@ @pytest.mark.parametrize( "run_config", [ - # lazy_fixture("distance_run"), - lazy_fixture("dummy_run"), - # lazy_fixture("onehot_run"), + # lf("distance_run"), + lf("dummy_run"), + # lf("onehot_run"), ], ) def test_apply(options, run_config, zarr_array, tmp_path): diff --git a/tests/operations/test_predict.py b/tests/operations/test_predict.py index cd8f6a6c..4a4b5347 100644 --- a/tests/operations/test_predict.py +++ b/tests/operations/test_predict.py @@ -8,7 +8,7 @@ from dacapo import predict import pytest -from pytest_lazyfixture import lazy_fixture +from 
pytest_lazy_fixtures import lf import logging @@ -18,9 +18,9 @@ @pytest.mark.parametrize( "run_config", [ - # lazy_fixture("distance_run"), - lazy_fixture("dummy_run"), - # lazy_fixture("onehot_run"), + # lf("distance_run"), + lf("dummy_run"), + # lf("onehot_run"), ], ) def test_predict(options, run_config, zarr_array, tmp_path): diff --git a/tests/operations/test_train.py b/tests/operations/test_train.py index d36655ea..a852101b 100644 --- a/tests/operations/test_train.py +++ b/tests/operations/test_train.py @@ -7,7 +7,7 @@ from dacapo.train import train_run import pytest -from pytest_lazyfixture import lazy_fixture +from pytest_lazy_fixtures import lf import logging @@ -20,9 +20,9 @@ @pytest.mark.parametrize( "run_config", [ - lazy_fixture("distance_run"), - lazy_fixture("dummy_run"), - lazy_fixture("onehot_run"), + lf("distance_run"), + lf("dummy_run"), + lf("onehot_run"), ], ) def test_train( diff --git a/tests/operations/test_validate.py b/tests/operations/test_validate.py index fa2cc6b9..1fc2a6e8 100644 --- a/tests/operations/test_validate.py +++ b/tests/operations/test_validate.py @@ -5,10 +5,10 @@ from dacapo.experiments import Run from dacapo.store.create_store import create_config_store, create_weights_store -from dacapo import validate +from dacapo import validate_run import pytest -from pytest_lazyfixture import lazy_fixture +from pytest_lazy_fixtures import lf import logging @@ -18,8 +18,8 @@ @pytest.mark.parametrize( "run_config", [ - lazy_fixture("distance_run"), - # lazy_fixture("onehot_run"), + lf("distance_run"), + lf("onehot_run"), ], ) def test_validate( @@ -56,13 +56,13 @@ def test_validate( # test validating iterations for which we know there are weights weights_store.store_weights(run, 0) - validate(run_config.name, 0, num_workers=4) + validate_run(run_config.name, 0, num_workers=4) # weights_store.store_weights(run, 1) - # validate(run_config.name, 1, num_workers=4) + # validate_run(run_config.name, 1, num_workers=4) # test validating weights that don't exist with pytest.raises(FileNotFoundError): - validate(run_config.name, 2, num_workers=4) + validate_run(run_config.name, 2, num_workers=4) if debug: os.chdir(old_path)