From 2eef2d234f6202b1ba45169b8031f959e8de706c Mon Sep 17 00:00:00 2001 From: David Ackerman Date: Tue, 19 Mar 2024 08:58:25 -0400 Subject: [PATCH 01/20] fix formatting --- .../random_source_pipeline-checkpoint.py | 288 +++++++++ .../synthetic_source_worker-checkpoint.py | 216 +++++++ .../.ipynb_checkpoints/utils-checkpoint.py | 242 ++++++++ .../cosem_example-checkpoint.ipynb | 362 ++++++++++++ ...example_fill_in_the_blank-checkpoint.ipynb | 236 ++++++++ .../cosem_finetune_example-checkpoint.ipynb | 271 +++++++++ .../synthetic_example-checkpoint.ipynb | 548 ++++++++++++++++++ .../synthetic_example-checkpoint.py | 385 ++++++++++++ .../distance_task/cosem_example.ipynb | 2 +- .../cosem_example_fill_in_the_blank.py | 17 +- dacapo/examples/synthetic_source_worker.py | 1 + dacapo/examples/utils.py | 4 +- 12 files changed, 2561 insertions(+), 11 deletions(-) create mode 100644 dacapo/examples/.ipynb_checkpoints/random_source_pipeline-checkpoint.py create mode 100644 dacapo/examples/.ipynb_checkpoints/synthetic_source_worker-checkpoint.py create mode 100644 dacapo/examples/.ipynb_checkpoints/utils-checkpoint.py create mode 100644 dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example-checkpoint.ipynb create mode 100644 dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example_fill_in_the_blank-checkpoint.ipynb create mode 100644 dacapo/examples/distance_task/.ipynb_checkpoints/cosem_finetune_example-checkpoint.ipynb create mode 100644 dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.ipynb create mode 100644 dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.py diff --git a/dacapo/examples/.ipynb_checkpoints/random_source_pipeline-checkpoint.py b/dacapo/examples/.ipynb_checkpoints/random_source_pipeline-checkpoint.py new file mode 100644 index 00000000..8dbc950b --- /dev/null +++ b/dacapo/examples/.ipynb_checkpoints/random_source_pipeline-checkpoint.py @@ -0,0 +1,288 @@ +from typing import Iterable +import gunpowder as gp +import logging +import numpy as np +import random +from scipy.ndimage import ( + distance_transform_edt, + gaussian_filter, +) +from skimage.measure import label as relabel + +logging.basicConfig(level=logging.INFO) + + +class CreatePoints(gp.BatchFilter): + def __init__( + self, + labels, + num_points=(20, 150), + ): + self.labels = labels + self.num_points = num_points + + def process(self, batch, request): + labels = batch[self.labels].data + + num_points = random.randint(*self.num_points) + + z = np.random.randint(1, labels.shape[0] - 1, num_points) + y = np.random.randint(1, labels.shape[1] - 1, num_points) + x = np.random.randint(1, labels.shape[2] - 1, num_points) + + labels[z, y, x] = 1 + + batch[self.labels].data = labels + + +class MakeRaw(gp.BatchFilter): + def __init__( + self, + raw, + labels, + gaussian_noise_args: Iterable = (0.5, 0.1), + gaussian_noise_lim: float = 0.3, + gaussian_blur_args: Iterable = (0.5, 1.5), + membrane_like=True, + membrane_size=3, + inside_value=0.5, + ): + self.raw = raw + self.labels = labels + self.gaussian_noise_args = gaussian_noise_args + self.gaussian_noise_lim = gaussian_noise_lim + self.gaussian_blur_args = gaussian_blur_args + self.membrane_like = membrane_like + self.membrane_size = membrane_size + self.inside_value = inside_value + + def setup(self): + spec = self.spec[self.labels].copy() # type: ignore + spec.dtype = np.float32 + self.provides(self.raw, spec) + + def process(self, batch, request): + labels = batch[self.labels].data + raw: np.ndarray = 
np.zeros_like(labels, dtype=np.float32) + raw[labels > 0] = 1 + + # generate membrane-like structure + if self.membrane_like: + for id in np.unique(labels): + if id == 0: + continue + raw[distance_transform_edt(labels == id) > self.membrane_size] = self.inside_value # type: ignore + + # now add blur + raw = gaussian_filter(raw, random.uniform(*self.gaussian_blur_args)) + + # now add noise + noise = np.random.normal(*self.gaussian_noise_args, raw.shape) # type: ignore + # normalize to [0, gaussian_noise_lim] + noise -= noise.min() + noise /= noise.max() + noise *= self.gaussian_noise_lim + + raw += noise + raw /= 1 + self.gaussian_noise_lim + raw = 1 - raw # invert + raw.clip(0, 1, out=raw) + + # add to batch + spec = self._spec[self.raw].copy() # type: ignore + spec.roi = request[self.raw].roi + batch[self.raw] = gp.Array(raw, spec) + + +class DilatePoints(gp.BatchFilter): + def __init__(self, labels, dilations=[2, 8]): + self.labels = labels + self.dilations = dilations + + def process(self, batch, request): + labels = batch[self.labels].data + + dilations = random.randint(*self.dilations) + labels = (distance_transform_edt(labels == 0) <= dilations).astype(labels.dtype) # type: ignore + + batch[self.labels].data = labels + + +class RandomDilateLabels(gp.BatchFilter): + def __init__(self, labels, dilations=[2, 8]): + self.labels = labels + self.dilations = dilations + + def process(self, batch, request): + labels = batch[self.labels].data + + new_labels = np.zeros_like(labels) + for id in np.unique(labels): + if id == 0: + continue + dilations = np.random.randint(*self.dilations) + + # # make sure we don't overlap existing labels + new_labels[ + np.logical_or( + labels == id, + np.logical_and( + distance_transform_edt(labels != id) <= dilations, labels == 0 + ), + ) + ] = id # type: ignore + + batch[self.labels].data = new_labels + + +class Relabel(gp.BatchFilter): + def __init__(self, labels, connectivity=1): + self.labels = labels + self.connectivity = connectivity + + def process(self, batch, request): + labels = batch[self.labels].data + + relabeled = relabel(labels, connectivity=self.connectivity).astype(labels.dtype) # type: ignore + + batch[self.labels].data = relabeled + + +class ExpandLabels(gp.BatchFilter): + def __init__(self, labels, background=0): + self.labels = labels + self.background = background + + def process(self, batch, request): + labels_data = batch[self.labels].data + distance = labels_data.shape[0] + + distances, indices = distance_transform_edt( + labels_data == self.background, return_indices=True + ) # type: ignore + + expanded_labels = np.zeros_like(labels_data) + + dilate_mask = distances <= distance + + masked_indices = [ + dimension_indices[dilate_mask] for dimension_indices in indices + ] + + nearest_labels = labels_data[tuple(masked_indices)] + + expanded_labels[dilate_mask] = nearest_labels + + batch[self.labels].data = expanded_labels + + +class ZerosSource(gp.BatchProvider): + def __init__(self, key, spec): + self.key = key + self._spec = {key: spec} + + def setup(self): + pass + + def provide(self, request): + batch = gp.Batch() + + roi = request[self.key].roi + shape = (roi / self._spec[self.key].voxel_size).get_shape() + spec = self._spec[self.key].copy() + spec.roi = roi + + batch.arrays[self.key] = gp.Array(np.zeros(shape, dtype=spec.dtype), spec) + + return batch + + +def random_source_pipeline( + voxel_size=(8, 8, 8), + input_shape=(148, 148, 148), + dtype=np.uint8, + expand_labels=False, + relabel_connectivity=1, + random_dilate=True, + 
num_points=(20, 150), + gaussian_noise_args=(0, 0.1), + gaussian_blur_args=(0.5, 1.5), + membrane_like=True, + membrane_size=3, + inside_value=0.5, +): + """Create a random source pipeline and batch request for example training. + + Args: + + voxel_size (tuple of int): The size of a voxel in world units. + input_shape (tuple of int): The shape of the input arrays. + dtype (numpy.dtype): The dtype of the label arrays. + expand_labels (bool): Whether to expand the labels into the background. + relabel_connectivity (int): The connectivity used for for relabeling. + random_dilate (bool): Whether to randomly dilate the individual labels. + num_points (tuple of int): The range of the number of points to add to the labels. + gaussian_noise_args (tuple of float): The mean and standard deviation of the Gaussian noise to add to the raw array. + gaussian_blur_args (tuple of float): The mean and standard deviation of the Gaussian blur to apply to the raw array. + membrane_like (bool): Whether to generate a membrane-like structure in the raw array. + membrane_size (int): The width of the membrane-like structure on the outside of the objects. + inside_value (float): The value to set inside the membranes of objects. + + Returns: + + gunpowder.Pipeline: The batch generating Gunpowder pipeline. + gunpowder.BatchRequest: The batch request for the pipeline. + """ + + voxel_size = gp.Coordinate(voxel_size) + input_shape = gp.Coordinate(input_shape) + + labels = gp.ArrayKey("LABELS") + raw = gp.ArrayKey("RAW") + + input_size = input_shape * voxel_size + + request = gp.BatchRequest() + + request.add(labels, input_size) + request.add(raw, input_size) + + source_spec = gp.ArraySpec( + roi=gp.Roi((0, 0, 0), input_size), voxel_size=voxel_size, dtype=dtype + ) + source = ZerosSource(labels, source_spec) + + pipeline = source + + # randomly sample some points and write them into our zeros array as ones + pipeline += CreatePoints(labels, num_points=num_points) + + # grow the boundaries + pipeline += DilatePoints(labels) + + # relabel connected components + pipeline += Relabel(labels, connectivity=relabel_connectivity) + + if expand_labels: + # expand the labels outwards into the background + pipeline += ExpandLabels(labels) + + # relabel ccs again to deal with incorrectly connected background + pipeline += Relabel(labels, connectivity=relabel_connectivity) + + # randomly dilate labels + if random_dilate: + pipeline += RandomDilateLabels(labels) + + # make a raw array + pipeline += MakeRaw( + raw, + labels, + gaussian_noise_args=gaussian_noise_args, + gaussian_blur_args=gaussian_blur_args, + membrane_like=membrane_like, + membrane_size=membrane_size, + inside_value=inside_value, + ) + + return pipeline, request diff --git a/dacapo/examples/.ipynb_checkpoints/synthetic_source_worker-checkpoint.py b/dacapo/examples/.ipynb_checkpoints/synthetic_source_worker-checkpoint.py new file mode 100644 index 00000000..cdf21624 --- /dev/null +++ b/dacapo/examples/.ipynb_checkpoints/synthetic_source_worker-checkpoint.py @@ -0,0 +1,216 @@ +from dacapo.examples.random_source_pipeline import random_source_pipeline +import gunpowder as gp + + +from pathlib import Path +import sys +from dacapo.experiments.datasplits.datasets.arrays.zarr_array import ZarrArray +from dacapo.store.array_store import LocalArrayIdentifier +from dacapo.compute_context import create_compute_context +import dacapo + +import daisy +from funlib.geometry import Coordinate, Roi + +import numpy as np +import click + +import logging + +logger = 
logging.getLogger(__file__) + +read_write_conflict: bool = False +fit: str = "shrink" +path = __file__ + + +@click.group() +@click.option( + "--log-level", + type=click.Choice( + ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False + ), + default="INFO", +) +def cli(log_level): + logging.basicConfig(level=getattr(logging, log_level.upper())) + + +fit = "valid" + +def generate_synthetic_dataset( + output_container: Path | str, + raw_output_dataset: str = "raw", + labels_output_dataset: str = "labels", + shape: str | Coordinate = Coordinate((512, 512, 512)), + voxel_size: str | Coordinate = Coordinate((8, 8, 8)), + write_shape: str | Coordinate = Coordinate((256, 256, 256)), + num_workers: int = 16, + overwrite: bool = False, +): + # get ROI from string + if isinstance(voxel_size, str): + _voxel_size = Coordinate([int(v) for v in voxel_size.split(",")]) + else: + _voxel_size = voxel_size + if isinstance(shape, str): + _shape = Coordinate([int(v) for v in shape.split(",")]) + else: + _shape = shape + if isinstance(write_shape, str): + _write_shape = Coordinate([int(v) for v in write_shape.split(",")]) + else: + _write_shape = write_shape + roi = Roi((0, 0, 0), _shape * _voxel_size) + read_roi = write_roi = Roi((0, 0, 0), _write_shape * _voxel_size) + + # get arrays + raw_output_array_identifier = LocalArrayIdentifier( + Path(output_container), raw_output_dataset + ) + raw_output_array = ZarrArray.create_from_array_identifier( + raw_output_array_identifier, + roi=roi, + dtype=np.uint8, + voxel_size=_voxel_size, + num_channels=None, + axes=["z", "y", "x"], + overwrite=overwrite, + write_size=_write_shape * voxel_size, + ) + + labels_output_array_identifier = LocalArrayIdentifier( + Path(output_container), labels_output_dataset + ) + labels_output_array = ZarrArray.create_from_array_identifier( + labels_output_array_identifier, + roi=roi, + dtype=np.uint64, + voxel_size=_voxel_size, + num_channels=None, + axes=["z", "y", "x"], + overwrite=overwrite, + write_size=_write_shape * voxel_size, + ) + + # make daisy blockwise task + dacapo.run_blockwise( + __file__, + roi, + read_roi, + write_roi, + num_workers=num_workers, + raw_output_array_identifier=raw_output_array_identifier, + labels_output_array_identifier=labels_output_array_identifier, + ) + + +@cli.command() +@click.option( + "-oc", "--output_container", required=True, type=click.Path(file_okay=False) +) +@click.option("-rod", "--raw_output_dataset", required=True, type=str) +@click.option("-lod", "--labels_output_dataset", required=True, type=str) +def start_worker( + output_container: Path | str, + raw_output_dataset: str, + labels_output_dataset: str, +): + # get arrays + raw_output_array_identifier = LocalArrayIdentifier( + Path(output_container), raw_output_dataset + ) + raw_output_array = ZarrArray.open_from_array_identifier(raw_output_array_identifier) + + labels_output_array_identifier = LocalArrayIdentifier( + Path(output_container), labels_output_dataset + ) + labels_output_array = ZarrArray.open_from_array_identifier( + labels_output_array_identifier + ) + + # get data generator + + def batch_generator(shape=(128, 128, 128), voxel_size=(8, 8, 8)): + pipeline, request = random_source_pipeline( + input_shape=shape, voxel_size=voxel_size + ) + with gp.build(pipeline): + while True: + yield pipeline.request_batch(request) + + batch_gen = None + + id_offset = None + + # wait for blocks to run pipeline + client = daisy.Client() + + while True: + print("getting block") + with client.acquire_block() as block: + if block 
is None: + break + + if batch_gen is None or id_offset is None: + size = block.write_roi.get_shape() + voxel_size = raw_output_array.voxel_size + shape = Coordinate(size / voxel_size) + batch_gen = batch_generator( + shape=shape, + voxel_size=voxel_size, + ) + id_offset = np.prod(shape) # number of voxels in the block + batch = next(batch_gen) + raw_array = batch.arrays[gp.ArrayKey("RAW")] + labels_array = batch.arrays[gp.ArrayKey("LABELS")] + + raw_data = raw_array.data + raw_data -= raw_data.min() + raw_data /= raw_data.max() + raw_data *= 255 + raw_data = raw_data.astype(np.uint8) + labels_data = labels_array.data.astype(np.uint64) + labels_data += np.uint64(id_offset * block.block_id[1]) + labels_data[labels_data == np.uint64(id_offset * block.block_id[1])] = 0 + + # write to output array + raw_output_array[block.write_roi] = raw_data + labels_output_array[block.write_roi] = labels_data + + +def spawn_worker( + raw_output_array_identifier: "LocalArrayIdentifier", + labels_output_array_identifier: "LocalArrayIdentifier", +): + """Spawn a worker to generate a synthetic dataset. + + Args: + raw_output_array_identifier (LocalArrayIdentifier): The identifier of the raw output array. + labels_output_array_identifier (LocalArrayIdentifier): The identifier of the labels output array. + """ + compute_context = create_compute_context() + + # Make the command for the worker to run + command = [ + # "python", + sys.executable, + path, + "start-worker", + "--output_container", + raw_output_array_identifier.container, + "--raw_output_dataset", + raw_output_array_identifier.dataset, + "--labels_output_dataset", + labels_output_array_identifier.dataset, + ] + + def run_worker(): + # Run the worker in the given compute context + compute_context.execute(command) + + return run_worker + + +if __name__ == "__main__": + cli() diff --git a/dacapo/examples/.ipynb_checkpoints/utils-checkpoint.py b/dacapo/examples/.ipynb_checkpoints/utils-checkpoint.py new file mode 100644 index 00000000..184e594f --- /dev/null +++ b/dacapo/examples/.ipynb_checkpoints/utils-checkpoint.py @@ -0,0 +1,242 @@ +from typing import Optional +import neuroglancer +from IPython.display import IFrame +import numpy as np +import gunpowder as gp +from funlib.persistence import Array +from dacapo.experiments.datasplits.datasets.arrays import ZarrArray +from funlib.persistence import open_ds +from threading import Thread +import neuroglancer +from neuroglancer.viewer_state import ViewerState +import os +from dacapo.experiments.run import Run +from dacapo.store.create_store import create_array_store +from IPython.display import IFrame +import time +import copy +import json + +def get_viewer( + raw_array: gp.Array | Array | ZarrArray, + labels_array: gp.Array | Array | ZarrArray, + pred_array: Optional[gp.Array | Array | ZarrArray] = None, + pred_labels_array: Optional[gp.Array | Array | ZarrArray] = None, + width: int = 1500, + height: int = 600, +) -> IFrame: + arrays = { + "raw": raw_array, + "labels": labels_array, + } + if pred_array is not None: + arrays["pred"] = pred_array + if pred_labels_array is not None: + arrays["pred_labels"] = pred_labels_array + + data = {} + voxel_sizes = {} + for name, array in arrays.items(): + if hasattr(array, "to_ndarray"): + data[name] = array.to_ndarray() + else: + data[name] = array.data + if hasattr(array, "voxel_size"): + voxel_sizes[name] = array.voxel_size + else: + voxel_sizes[name] = array.spec.voxel_size + + neuroglancer.set_server_bind_address("0.0.0.0") + viewer = neuroglancer.Viewer() + with 
viewer.txn() as state: + state.showSlices = False + add_seg_layer(state, "labels", data["labels"], voxel_sizes["labels"]) + + add_scalar_layer(state, "raw", data["raw"], voxel_sizes["raw"]) + + if "pred" in data: + add_scalar_layer(state, "pred", data["pred"], voxel_sizes["pred"]) + + if "pred_labels" in data: + add_seg_layer( + state, "pred_labels", data["pred_labels"], voxel_sizes["pred_labels"] + ) + + return IFrame(src=viewer, width=width, height=height) + + +class NeuroglancerRunViewer: + def __init__(self, run: Run): + self.run: Run = run + self.most_recent_iteration = 0 + self.prediction = None + + def updated_neuroglancer_layer(self, layer_name, ds): + source = neuroglancer.LocalVolume( + data=ds.data, + dimensions=neuroglancer.CoordinateSpace( + names=["c", "z", "y", "x"], + units=["", "nm", "nm", "nm"], + scales=[1] + list(ds.voxel_size), + ), + voxel_offset=[0] + list(ds.roi.offset), + ) + new_state = copy.deepcopy(self.viewer.state) + if len(new_state.layers) == 1: + new_state.layers[layer_name] = neuroglancer.ImageLayer(source=source) + else: + # replace name everywhere to preserve state, like what is selected + new_state_str = json.dumps(new_state.to_json()) + new_state_str = new_state_str.replace(new_state.layers[-1].name, layer_name) + new_state = ViewerState(json.loads(new_state_str)) + new_state.layers[layer_name].source = source + + self.viewer.set_state(new_state) + print(self.viewer.state) + + def deprecated_start_neuroglancer(self): + neuroglancer.set_server_bind_address("0.0.0.0") + self.viewer = neuroglancer.Viewer() + + def start_neuroglancer(self): + neuroglancer.set_server_bind_address("0.0.0.0") + self.viewer = neuroglancer.Viewer() + with self.viewer.txn() as state: + state.showSlices = False + + state.layers["raw"] = neuroglancer.ImageLayer( + source=neuroglancer.LocalVolume( + data=self.raw.data, + dimensions=neuroglancer.CoordinateSpace( + names=["z", "y", "x"], + units=["nm", "nm", "nm"], + scales=self.raw.voxel_size, + ), + voxel_offset=self.raw.roi.offset, + ), + ) + return IFrame(src=self.viewer, width=1800, height=900) + + def start(self): + self.array_store = create_array_store() + self.get_datasets() + self.new_validation_checker() + return self.start_neuroglancer() + + def open_from_array_identitifier(self, array_identifier): + if os.path.exists(array_identifier.container / array_identifier.dataset): + return open_ds(str(array_identifier.container), array_identifier.dataset) + else: + return None + + def get_datasets(self): + for validation_dataset in self.run.datasplit.validate: + ( + input_raw_array_identifier, + input_gt_array_identifier, + ) = self.array_store.validation_input_arrays( + self.run.name, validation_dataset.name + ) + + self.raw = self.open_from_array_identitifier(input_raw_array_identifier) + self.gt = self.open_from_array_identitifier(input_gt_array_identifier) + print(self.raw) + + def update_best_info(self, iteration, validation_dataset_name): + prediction_array_identifier = self.array_store.validation_prediction_array( + self.run.name, + iteration, + validation_dataset_name, + ) + self.prediction = self.open_from_array_identitifier(prediction_array_identifier) + self.most_recent_iteration = iteration + + def update_neuroglancer(self, iteration): + self.updated_neuroglancer_layer( + f"prediction at iteration {iteration}", self.prediction + ) + return None + + def update_best(self, iteration, validation_dataset_name): + self.update_best_info(iteration, validation_dataset_name) + self.update_neuroglancer(iteration) + + def 
new_validation_checker(self): + self.process = Thread(target=self.update_with_new_validation_if_possible) + self.process.daemon = True + self.process.start() + + def update_with_new_validation_if_possible(self): + # Here we are assuming that we are checking the directory .../valdiation_config/prediction + # Ideally we will only have to check for the current best validation + while True: + time.sleep(3) + for validation_dataset in self.run.datasplit.validate: + most_recent_iteration_previous = self.most_recent_iteration + prediction_array_identifier = ( + self.array_store.validation_prediction_array( + self.run.name, + self.most_recent_iteration, + validation_dataset.name, + ) + ) + + container = prediction_array_identifier.container + if os.path.exists(container): + iteration_dirs = [ + name + for name in os.listdir(container) + if os.path.isdir(os.path.join(container, name)) + and name.isnumeric() + ] + + for iteration_dir in iteration_dirs: + if int(iteration_dir) > self.most_recent_iteration: + inference_dir = os.path.join( + container, + iteration_dir, + "validation_config", + "prediction", + ) + if os.path.exists(inference_dir): + # Ignore basic zarr and n5 files + inference_dir_contents = [ + f + for f in os.listdir(inference_dir) + if not f.startswith(".") and not f.endswith(".json") + ] + if inference_dir_contents: + # then it should have at least a chunk writtent out, assume it has all of it written out + self.most_recent_iteration = int(iteration_dir) + if most_recent_iteration_previous != self.most_recent_iteration: + self.update_best( + self.most_recent_iteration, + validation_dataset.name, + ) + +def add_seg_layer(state, name, data, voxel_size): + state.layers[name] = neuroglancer.SegmentationLayer( + # segments=[str(i) for i in np.unique(data[data > 0])], # this line will cause all objects to be selected and thus all meshes to be generated...will be slow if lots of high res meshes + source=neuroglancer.LocalVolume( + data=data, + dimensions=neuroglancer.CoordinateSpace( + names=["z", "y", "x"], + units=["nm", "nm", "nm"], + scales=voxel_size, + ), + ), + segments=np.unique(data[data > 0]), + ) + + +def add_scalar_layer(state, name, data, voxel_size): + state.layers[name] = neuroglancer.ImageLayer( + source=neuroglancer.LocalVolume( + data=data, + dimensions=neuroglancer.CoordinateSpace( + names=["z", "y", "x"], + units=["nm", "nm", "nm"], + scales=voxel_size, + ), + ), + ) diff --git a/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example-checkpoint.ipynb b/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example-checkpoint.ipynb new file mode 100644 index 00000000..3c974cf1 --- /dev/null +++ b/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example-checkpoint.ipynb @@ -0,0 +1,362 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " # Dacapo\n", + "\n", + " DaCapo is a framework that allows for easy configuration and execution of established machine learning techniques on arbitrarily large volumes of multi-dimensional images.\n", + "\n", + " DaCapo has 4 major configurable components:\n", + " 1. **dacapo.datasplits.DataSplit**\n", + "\n", + " 2. **dacapo.tasks.Task**\n", + "\n", + " 3. **dacapo.architectures.Architecture**\n", + "\n", + " 4. 
**dacapo.trainers.Trainer**\n", + "\n", + " These are then combined in a single **dacapo.experiments.Run** that includes your starting point (whether you want to start training from scratch or continue off of a previously trained model) and stopping criterion (the number of iterations you want to train)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Environment setup\n", + " If you have not already done so, you will need to install DaCapo. You can do this by first creating a new environment and then installing DaCapo using pip.\n", + "\n", + " ```bash\n", + " conda create -n dacapo python=3.10\n", + " conda activate dacapo\n", + " ```\n", + "\n", + " Then, you can install DaCapo using pip, via GitHub:\n", + "\n", + " ```bash\n", + " pip install git+https://github.com/janelia-cellmap/dacapo.git\n", + " ```\n", + "\n", + " Or you can clone the repository and install it locally:\n", + "\n", + " ```bash\n", + " git clone https://github.com/janelia-cellmap/dacapo.git\n", + " cd dacapo\n", + " pip install -e .\n", + " ```\n", + "\n", + " Be sure to select this environment in your Jupyter notebook or JupyterLab." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Config Store\n", + " To define where the data goes, create a dacapo.yaml configuration file either in `~/.config/dacapo/dacapo.yaml` or in `./dacapo.yaml`. Here is a template:\n", + "\n", + " ```yaml\n", + " type: files\n", + " runs_base_dir: /path/to/my/data/storage\n", + " ```\n", + " The `runs_base_dir` defines where your on-disk data will be stored. The `type` setting determines the database backend. The default is `files`, which stores the data in a file tree on disk. Alternatively, you can use `mongodb` to store the data in a MongoDB database. To use MongoDB, you will need to provide a `mongodbhost` and `mongodbname` in the configuration file:\n", + "\n", + " ```yaml\n", + " ...\n", + " mongodbhost: mongodb://dbuser:dbpass@dburl:dbport/\n", + " mongodbname: dacapo" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating FileConfigStore:\n", + "\tpath: /nrs/cellmap/ackermand/dacapo_learnathon/configs\n" + ] + } + ], + "source": [ + "# First we need to create a config store to store our configurations\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Datasplit\n", + " Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?\n", + " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", + " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:dacapo.experiments.datasplits.datasplit_generator: No targets specified, using all classes in the dataset as target ['mito'].\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Neuroglancer link: http://h10u28.int.janelia.org:19399/v/a9fea3fb1009ac31987b6fb7d5ecb032fcff77db/\n" + ] + } + ], + "source": [ + "from dacapo.experiments.datasplits import DataSplitGenerator\n", + "from funlib.geometry import Coordinate\n", + "\n", + "input_resolution = Coordinate(8, 8, 8)\n", + "output_resolution = Coordinate(4, 4, 4)\n", + "datasplit_config = DataSplitGenerator.generate_from_csv(\n", + " \"/misc/public/dacapo_learnathon/datasplit_csvs/cosem_example.csv\", input_resolution, output_resolution\n", + ").compute()\n", + "\n", + "datasplit = datasplit_config.datasplit_type(datasplit_config)\n", + "viewer = datasplit._neuroglancer()\n", + "config_store.store_datasplit_config(datasplit_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Task\n", + " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", + " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", + " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", + " also require specific non-linearities or output formats from your model." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.tasks import DistanceTaskConfig\n", + "\n", + "task_config = DistanceTaskConfig(\n", + " name=\"cosem_distance_task_4nm\",\n", + " channels=[\"mito\"],\n", + " clip_distance=40.0,\n", + " tol_distance=40.0,\n", + " scale_factor=80.0,\n", + ")\n", + "config_store.store_task_config(task_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Architecture\n", + "\n", + " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", + "\n", + "architecture_config = CNNectomeUNetConfig(\n", + " name=\"upsample_unet\",\n", + " input_shape=Coordinate(216, 216, 216),\n", + " eval_shape_increase=Coordinate(72, 72, 72),\n", + " fmaps_in=1,\n", + " num_fmaps=12,\n", + " fmaps_out=72,\n", + " fmap_inc_factor=6,\n", + " downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],\n", + " constant_upsample=True,\n", + " upsample_factors=[(2, 2, 2)],\n", + ")\n", + "config_store.store_architecture_config(architecture_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Trainer\n", + "\n", + " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", + "from dacapo.experiments.trainers.gp_augments import (\n", + " ElasticAugmentConfig,\n", + " GammaAugmentConfig,\n", + " IntensityAugmentConfig,\n", + " IntensityScaleShiftAugmentConfig,\n", + ")\n", + "\n", + "trainer_config = GunpowderTrainerConfig(\n", + " name=\"cosem\",\n", + " batch_size=1,\n", + " learning_rate=0.0001,\n", + " num_data_fetchers=20,\n", + " augments=[\n", + " ElasticAugmentConfig(\n", + " control_point_spacing=[100, 100, 100],\n", + " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", + " rotation_interval=(0.0, 1.5707963267948966),\n", + " subsample=8,\n", + " uniform_3d_rotation=True,\n", + " ),\n", + " IntensityAugmentConfig(scale=(0.25, 1.75), shift=(-0.5, 0.35), clip=True),\n", + " GammaAugmentConfig(gamma_range=(0.5, 2.0)),\n", + " IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0),\n", + " ],\n", + " snapshot_interval=10000,\n", + " min_masked=0.05,\n", + " clip_raw=True,\n", + ")\n", + "config_store.store_trainer_config(trainer_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Run\n", + " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments import RunConfig\n", + "from dacapo.experiments.run import Run\n", + "\n", + "start_config = None\n", + "\n", + "# Uncomment to start from a pretrained model\n", + "# start_config = StartConfig(\n", + "# \"setup04\",\n", + "# \"best\",\n", + "# )\n", + "\n", + "iterations = 2000\n", + "validation_interval = 50\n", + "repetitions = 1\n", + "for i in range(repetitions):\n", + " run_config = RunConfig(\n", + " name=\"cosem_distance_run_4nm\",\n", + " # # NOTE: This is a template for the name of the run. You can customize it as you see fit.\n", + " # name=(\"_\").join(\n", + " # [\n", + " # \"example\",\n", + " # \"scratch\" if start_config is None else \"finetuned\",\n", + " # datasplit_config.name,\n", + " # task_config.name,\n", + " # architecture_config.name,\n", + " # trainer_config.name,\n", + " # ]\n", + " # )\n", + " # + f\"__{i}\",\n", + " datasplit_config=datasplit_config,\n", + " task_config=task_config,\n", + " architecture_config=architecture_config,\n", + " trainer_config=trainer_config,\n", + " num_iterations=iterations,\n", + " validation_interval=validation_interval,\n", + " repetition=i,\n", + " start_config=start_config,\n", + " )\n", + "\n", + " print(run_config.name)\n", + " config_store.store_run_config(run_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Train\n", + " To train one of the runs, you can either do it by first creating a **Run** directly from the run config\n", + " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. 
This may cause errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.train import train_run\n", + "from dacapo.experiments.run import Run\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "\n", + "run = Run(config_store.retrieve_run_config(\"cosem_distance_run_4nm\"))\n", + "train_run(run)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements.\n", + " # %%\n", + " from dacapo.validate import validate\n", + " # validate(run_config.name, iterations, num_workers=32)\n", + " validate(\"cosem_distance_run\", 1500, num_workers=10)\n", + " # %%" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "DaCapo Learnathon", + "language": "python", + "name": "dacapo_learnathon" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example_fill_in_the_blank-checkpoint.ipynb b/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example_fill_in_the_blank-checkpoint.ipynb new file mode 100644 index 00000000..83134d93 --- /dev/null +++ b/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example_fill_in_the_blank-checkpoint.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First we need to create a config store to store our configurations\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "# create the config store\n", + "config_store = ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Datasplit\n", + " Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?\n", + " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", + " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.datasplits import DataSplitGenerator\n", + "from funlib.geometry import Coordinate\n", + "\n", + "# We will be working with cosem data and we want to work with 8nm isotropic input resolution for the raw data and output at 4 nm resolution.\n", + "# Create these resolutions as Coordinates.\n", + "input_resolution = ...\n", + "output_resolution = ...\n", + "\n", + "# Create the datasplit config using the cosem_example.csv located in the shared learnathon examples\n", + "datasplit_config = ...\n", + "\n", + "# Create the datasplit, produce the neuroglancer link and store the datasplit\n", + "datasplit = ...\n", + "viewer = ...\n", + "config_store...\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Task\n", + " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", + " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", + " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", + " also require specific non-linearities or output formats from your model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.tasks import DistanceTaskConfig\n", + "\n", + "# Create a distance task config where the clip_distance=tol_distance=10x the output resolution,\n", + "# and scale_factor = 20x the output resolution\n", + "task_config = \n", + "config_store....\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Architecture\n", + "\n", + " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", + "\n", + "architecture_config = CNNectomeUNetConfig(\n", + " name=\"upsample_unet\",\n", + " input_shape=Coordinate(216, 216, 216),\n", + " eval_shape_increase=Coordinate(72, 72, 72),\n", + " fmaps_in=1,\n", + " num_fmaps=12,\n", + " fmaps_out=72,\n", + " fmap_inc_factor=6,\n", + " downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],\n", + " constant_upsample=True,\n", + " upsample_factors=[(2, 2, 2)],\n", + ")\n", + "config_store.store_architecture_config(architecture_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Trainer\n", + "\n", + " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", + "from dacapo.experiments.trainers.gp_augments import (\n", + " ElasticAugmentConfig,\n", + " GammaAugmentConfig,\n", + " IntensityAugmentConfig,\n", + " IntensityScaleShiftAugmentConfig,\n", + ")\n", + "\n", + "trainer_config = GunpowderTrainerConfig(\n", + " name=\"cosem\",\n", + " batch_size=1,\n", + " learning_rate=0.0001,\n", + " num_data_fetchers=20,\n", + " augments=[\n", + " ElasticAugmentConfig(\n", + " control_point_spacing=[100, 100, 100],\n", + " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", + " rotation_interval=(0.0, 1.5707963267948966),\n", + " subsample=8,\n", + " uniform_3d_rotation=True,\n", + " ),\n", + " # Create an intensity augment config scaling from .25 to 1.25, shifting from -.5 to .35, and with clipping\n", + " ...,\n", + " # Create a gamma augment config with range .5 to 2\n", + " ...,\n", + " # Create an intensity scale shift agument config to rescale data from the range 0->1 to -1->1\n", + " ...,\n", + " ],\n", + " snapshot_interval=10000,\n", + " min_masked=0.05,\n", + " clip_raw=True,\n", + ")\n", + "# Store the trainer\n", + "config_store....\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Run\n", + " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments import RunConfig\n", + "from dacapo.experiments.run import Run\n", + "\n", + "start_config = None\n", + "\n", + "# Uncomment to start from a pretrained model\n", + "# start_config = StartConfig(\n", + "# \"setup04\",\n", + "# \"best\",\n", + "# )\n", + "\n", + "iterations = 2000\n", + "validation_interval = iterations // 2\n", + "# Set up a run using all of the configs and settings you created above\n", + "run_config = ...\n", + "\n", + "print(run_config.name)\n", + "config_store...\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Train\n", + " To train one of the runs, you can either do it by first creating a **Run** directly from the run config\n", + " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. 
This may cause errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.train import train_run\n", + "from dacapo.experiments.run import Run\n", + "# load the run and train it\n", + "run = Run(config_store...)\n", + "train_run(run)\n", + "" + ] + } + ], + "nbformat": 4, + "nbformat_minor": 2, + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + } + } +} \ No newline at end of file diff --git a/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_finetune_example-checkpoint.ipynb b/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_finetune_example-checkpoint.ipynb new file mode 100644 index 00000000..cabecc87 --- /dev/null +++ b/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_finetune_example-checkpoint.ipynb @@ -0,0 +1,271 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First we need to create a config store to store our configurations\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Datasplit\n", + " Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?\n", + " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", + " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.datasplits import DataSplitGenerator\n", + "from funlib.geometry import Coordinate\n", + "\n", + "input_resolution = Coordinate(8, 8, 8)\n", + "output_resolution = Coordinate(4, 4, 4)\n", + "datasplit_config = DataSplitGenerator.generate_from_csv(\n", + " \"cosem_example.csv\", input_resolution, output_resolution\n", + ").compute()\n", + "\n", + "datasplit = datasplit_config.datasplit_type(datasplit_config)\n", + "viewer = datasplit._neuroglancer()\n", + "config_store.store_datasplit_config(datasplit_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Task\n", + " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", + " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", + " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", + " also require specific non-linearities or output formats from your model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.tasks import DistanceTaskConfig\n", + "\n", + "task_config = DistanceTaskConfig(\n", + " name=\"cosem_distance_task_4nm\",\n", + " channels=[\"mito\"],\n", + " clip_distance=40.0,\n", + " tol_distance=40.0,\n", + " scale_factor=80.0,\n", + ")\n", + "config_store.store_task_config(task_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Architecture\n", + "\n", + " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", + "\n", + "architecture_config = CNNectomeUNetConfig(\n", + " name=\"upsample_unet\",\n", + " input_shape=Coordinate(216, 216, 216),\n", + " eval_shape_increase=Coordinate(72, 72, 72),\n", + " fmaps_in=1,\n", + " num_fmaps=12,\n", + " fmaps_out=72,\n", + " fmap_inc_factor=6,\n", + " downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],\n", + " constant_upsample=True,\n", + " upsample_factors=[(2, 2, 2)],\n", + ")\n", + "config_store.store_architecture_config(architecture_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Trainer\n", + "\n", + " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", + "from dacapo.experiments.trainers.gp_augments import (\n", + " ElasticAugmentConfig,\n", + " GammaAugmentConfig,\n", + " IntensityAugmentConfig,\n", + " IntensityScaleShiftAugmentConfig,\n", + ")\n", + "\n", + "trainer_config = GunpowderTrainerConfig(\n", + " name=\"cosem_finetune\",\n", + " batch_size=1,\n", + " learning_rate=0.0001,\n", + " num_data_fetchers=20,\n", + " augments=[\n", + " ElasticAugmentConfig(\n", + " control_point_spacing=[100, 100, 100],\n", + " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", + " rotation_interval=(0.0, 1.5707963267948966),\n", + " subsample=8,\n", + " uniform_3d_rotation=True,\n", + " ),\n", + " IntensityAugmentConfig(scale=(0.25, 1.75), shift=(-0.5, 0.35), clip=True),\n", + " GammaAugmentConfig(gamma_range=(0.5, 2.0)),\n", + " IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0),\n", + " ],\n", + " snapshot_interval=10000,\n", + " min_masked=0.05,\n", + " clip_raw=True,\n", + ")\n", + "config_store.store_trainer_config(trainer_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Run\n", + " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments import RunConfig\n", + "from dacapo.experiments.run import Run\n", + "\n", + "start_config = None\n", + "\n", + "# Uncomment to start from a pretrained model\n", + "from dacapo.experiments.starts import CosemStartConfig\n", + "\n", + "start_config = CosemStartConfig(\"setup04\", \"1820500\")\n", + "start_config.start_type(start_config).check()\n", + "iterations = 2000\n", + "validation_interval = iterations // 2\n", + "repetitions = 1\n", + "for i in range(repetitions):\n", + " run_config = RunConfig(\n", + " name=\"cosem_distance_run_4nm_finetune\",\n", + " # # NOTE: This is a template for the name of the run. You can customize it as you see fit.\n", + " # name=(\"_\").join(\n", + " # [\n", + " # \"example\",\n", + " # \"scratch\" if start_config is None else \"finetuned\",\n", + " # datasplit_config.name,\n", + " # task_config.name,\n", + " # architecture_config.name,\n", + " # trainer_config.name,\n", + " # ]\n", + " # )\n", + " # + f\"__{i}\",\n", + " datasplit_config=datasplit_config,\n", + " task_config=task_config,\n", + " architecture_config=architecture_config,\n", + " trainer_config=trainer_config,\n", + " num_iterations=iterations,\n", + " validation_interval=validation_interval,\n", + " repetition=i,\n", + " start_config=start_config,\n", + " )\n", + "\n", + " print(run_config.name)\n", + " config_store.store_run_config(run_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Train\n", + " To train one of the runs, you can either do it by first creating a **Run** directly from the run config\n", + " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. This may cause errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.train import train_run\n", + "from dacapo.experiments.run import Run\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "\n", + "run = Run(config_store.retrieve_run_config(\"cosem_distance_run_4nm_finetune\"))\n", + "train_run(run)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. 
This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements.\n", + " # %%\n", + " from dacapo.validate import validate\n", + " # validate(run_config.name, iterations, num_workers=32)\n", + " validate(\"cosem_distance_run\", 1500, num_workers=10)\n", + " # %%" + ] + } + ], + "nbformat": 4, + "nbformat_minor": 2, + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + } + } +} \ No newline at end of file diff --git a/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.ipynb b/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.ipynb new file mode 100644 index 00000000..59d0ec6b --- /dev/null +++ b/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.ipynb @@ -0,0 +1,548 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " # Dacapo\n", + "\n", + " DaCapo is a framework that allows for easy configuration and execution of established machine learning techniques on arbitrarily large volumes of multi-dimensional images.\n", + "\n", + " DaCapo has 4 major configurable components:\n", + " 1. **dacapo.datasplits.DataSplit**\n", + "\n", + " 2. **dacapo.tasks.Task**\n", + "\n", + " 3. **dacapo.architectures.Architecture**\n", + "\n", + " 4. **dacapo.trainers.Trainer**\n", + "\n", + " These are then combined in a single **dacapo.experiments.Run** that includes your starting point (whether you want to start training from scratch or continue off of a previously trained model) and stopping criterion (the number of iterations you want to train)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Environment setup\n", + " If you have not already done so, you will need to install DaCapo. You can do this by first creating a new environment and then installing DaCapo using pip.\n", + "\n", + " ```bash\n", + " conda create -n dacapo python=3.10\n", + " conda activate dacapo\n", + " ```\n", + "\n", + " Then, you can install DaCapo using pip, via GitHub:\n", + "\n", + " ```bash\n", + " pip install git+https://github.com/janelia-cellmap/dacapo.git\n", + " ```\n", + "\n", + " Or you can clone the repository and install it locally:\n", + "\n", + " ```bash\n", + " git clone https://github.com/janelia-cellmap/dacapo.git\n", + " cd dacapo\n", + " pip install -e .\n", + " ```\n", + "\n", + " Be sure to select this environment in your Jupyter notebook or JupyterLab." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Config Store\n", + " To define where the data goes, create a dacapo.yaml configuration file either in `~/.config/dacapo/dacapo.yaml` or in `./dacapo.yaml`. Here is a template:\n", + "\n", + " ```yaml\n", + " type: files\n", + " runs_base_dir: /path/to/my/data/storage\n", + " ```\n", + " The `runs_base_dir` defines where your on-disk data will be stored. The `type` setting determines the database backend. The default is `files`, which stores the data in a file tree on disk. Alternatively, you can use `mongodb` to store the data in a MongoDB database. 
To use MongoDB, you will need to provide a `mongodbhost` and `mongodbname` in the configuration file:\n", + "\n", + " ```yaml\n", + " ...\n", + " mongodbhost: mongodb://dbuser:dbpass@dburl:dbport/\n", + " mongodbname: dacapo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First we need to create a config store to store our configurations\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Then let's make sure we have data to train on\n", + "from pathlib import Path\n", + "from dacapo import Options\n", + "from dacapo.examples.utils import get_viewer\n", + "from dacapo.examples.synthetic_source_worker import generate_synthetic_dataset\n", + "from funlib.geometry import Coordinate\n", + "from funlib.persistence import open_ds\n", + "\n", + "options = Options.instance()\n", + "runs_base_dir = options.runs_base_dir\n", + "force = False\n", + "num_workers = 32\n", + "\n", + "# First for training data\n", + "train_data_path = Path(runs_base_dir, \"example_train.zarr\")\n", + "try:\n", + " assert not force\n", + " raw_array = open_ds(str(train_data_path), \"raw\")\n", + " labels_array = open_ds(str(train_data_path), \"labels\")\n", + "except:\n", + " train_shape = Coordinate((512, 512, 512))\n", + " generate_synthetic_dataset(\n", + " train_data_path, shape=train_shape, overwrite=True, num_workers=num_workers\n", + " )\n", + " raw_array = open_ds(str(train_data_path), \"raw\")\n", + " labels_array = open_ds(str(train_data_path), \"labels\")\n", + "\n", + "get_viewer(raw_array, labels_array)\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Then for validation data\n", + "validate_data_path = Path(runs_base_dir, \"example_validate.zarr\")\n", + "try:\n", + " assert not force\n", + " raw_array = open_ds(str(validate_data_path), \"raw\")\n", + " labels_array = open_ds(str(validate_data_path), \"labels\")\n", + "except:\n", + " validate_shape = Coordinate((152, 152, 152)) * 3\n", + " generate_synthetic_dataset(\n", + " validate_data_path,\n", + " shape=validate_shape,\n", + " write_shape=Coordinate((152, 152, 152)),\n", + " overwrite=True,\n", + " num_workers=num_workers,\n", + " )\n", + "\n", + "get_viewer(raw_array, labels_array)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Datasplit\n", + " Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?\n", + " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", + " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.datasplits.datasets.arrays import (\n", + " BinarizeArrayConfig,\n", + " ZarrArrayConfig,\n", + " IntensitiesArrayConfig,\n", + ")\n", + "from dacapo.experiments.datasplits import TrainValidateDataSplitConfig\n", + "from dacapo.experiments.datasplits.datasets import RawGTDatasetConfig\n", + "from pathlib import Path\n", + "from dacapo import Options\n", + "\n", + "options = Options.instance()\n", + "runs_base_dir = options.runs_base_dir\n", + "\n", + "datasplit_config = TrainValidateDataSplitConfig(\n", + " name=\"synthetic_datasplit_config\",\n", + " train_configs=[\n", + " RawGTDatasetConfig(\n", + " name=\"train_data\",\n", + " weight=1,\n", + " raw_config=IntensitiesArrayConfig(\n", + " name=\"raw_train_data\",\n", + " source_array_config=ZarrArrayConfig(\n", + " name=\"raw_train_data_uint8\",\n", + " file_name=Path(runs_base_dir, \"example_train.zarr\"),\n", + " dataset=\"raw\",\n", + " ),\n", + " min=0.0,\n", + " max=255.0,\n", + " ),\n", + " gt_config=BinarizeArrayConfig(\n", + " name=\"gt_train_data\",\n", + " source_array_config=ZarrArrayConfig(\n", + " name=\"gt_train_data_zarr\",\n", + " file_name=Path(runs_base_dir, \"example_train.zarr\"),\n", + " dataset=\"labels\",\n", + " ),\n", + " groupings=[(\"labels\", [])],\n", + " ),\n", + " )\n", + " ],\n", + " validate_configs=[\n", + " RawGTDatasetConfig(\n", + " name=\"validate_data\",\n", + " weight=1,\n", + " raw_config=IntensitiesArrayConfig(\n", + " name=\"raw_validate_data\",\n", + " source_array_config=ZarrArrayConfig(\n", + " name=\"raw_validate_data_uint8\",\n", + " file_name=Path(runs_base_dir, \"example_validate.zarr\"),\n", + " dataset=\"raw\",\n", + " ),\n", + " min=0.0,\n", + " max=255.0,\n", + " ),\n", + " gt_config=BinarizeArrayConfig(\n", + " name=\"gt_validate_data\",\n", + " source_array_config=ZarrArrayConfig(\n", + " name=\"gt_validate_data_zarr\",\n", + " file_name=Path(runs_base_dir, \"example_validate.zarr\"),\n", + " dataset=\"labels\",\n", + " ),\n", + " groupings=[(\"labels\", [])],\n", + " ),\n", + " ),\n", + " ],\n", + ")\n", + "\n", + "config_store.store_datasplit_config(datasplit_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Task\n", + " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", + " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", + " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", + " also require specific non-linearities or output formats from your model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.tasks import DistanceTaskConfig\n", + "\n", + "task_config = DistanceTaskConfig(\n", + " name=\"example_distance_task\",\n", + " channels=[\"labels\"],\n", + " clip_distance=80.0,\n", + " tol_distance=80.0,\n", + " scale_factor=160.0,\n", + ")\n", + "config_store.store_task_config(task_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Architecture\n", + "\n", + " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", + "\n", + "architecture_config = CNNectomeUNetConfig(\n", + " name=\"example-unet\",\n", + " input_shape=(172, 172, 172),\n", + " fmaps_out=24,\n", + " fmaps_in=1,\n", + " num_fmaps=12,\n", + " fmap_inc_factor=2,\n", + " downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],\n", + " eval_shape_increase=(72, 72, 72),\n", + ")\n", + "config_store.store_architecture_config(architecture_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Trainer\n", + "\n", + " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", + "from dacapo.experiments.trainers.gp_augments import (\n", + " ElasticAugmentConfig,\n", + " GammaAugmentConfig,\n", + " IntensityAugmentConfig,\n", + " IntensityScaleShiftAugmentConfig,\n", + ")\n", + "\n", + "trainer_config = GunpowderTrainerConfig(\n", + " name=\"default\",\n", + " batch_size=1,\n", + " learning_rate=0.0001,\n", + " num_data_fetchers=20,\n", + " augments=[\n", + " ElasticAugmentConfig(\n", + " control_point_spacing=[100, 100, 100],\n", + " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", + " rotation_interval=(0.0, 1.5707963267948966),\n", + " subsample=8,\n", + " uniform_3d_rotation=True,\n", + " ),\n", + " IntensityAugmentConfig(scale=(0.25, 1.75), shift=(-0.5, 0.35), clip=True),\n", + " GammaAugmentConfig(gamma_range=(0.5, 2.0)),\n", + " IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0),\n", + " ],\n", + " snapshot_interval=10000,\n", + " min_masked=0.05,\n", + " clip_raw=True,\n", + ")\n", + "config_store.store_trainer_config(trainer_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Run\n", + " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments import RunConfig\n", + "from dacapo.experiments.run import Run\n", + "\n", + "start_config = None\n", + "\n", + "# Uncomment to start from a pretrained model\n", + "# start_config = StartConfig(\n", + "# \"setup04\",\n", + "# \"best\",\n", + "# )\n", + "\n", + "iterations = 2000\n", + "validation_interval = iterations // 2\n", + "repetitions = 1\n", + "for i in range(repetitions):\n", + " run_config = RunConfig(\n", + " name=\"example_synthetic_distance_run\",\n", + " # # NOTE: This is a template for the name of the run. 
You can customize it as you see fit.\n", + " # name=(\"_\").join(\n", + " # [\n", + " # \"example\",\n", + " # \"scratch\" if start_config is None else \"finetuned\",\n", + " # datasplit_config.name,\n", + " # task_config.name,\n", + " # architecture_config.name,\n", + " # trainer_config.name,\n", + " # ]\n", + " # )\n", + " # + f\"__{i}\",\n", + " datasplit_config=datasplit_config,\n", + " task_config=task_config,\n", + " architecture_config=architecture_config,\n", + " trainer_config=trainer_config,\n", + " num_iterations=iterations,\n", + " validation_interval=validation_interval,\n", + " repetition=i,\n", + " start_config=start_config,\n", + " )\n", + "\n", + " print(run_config.name)\n", + " config_store.store_run_config(run_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Train\n", + " To train one of the runs, you can either do it by first creating a **Run** directly from the run config\n", + " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. This may cause errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.train import train_run\n", + "from dacapo.experiments.run import Run\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "\n", + "run = Run(config_store.retrieve_run_config(\"example_synthetic_distance_run\"))\n", + "train_run(run)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Validate\n", + " Once you have trained your model, you can validate it on the validation datasets used during training. You can use the `dacapo.validate` function to do this. You can also use the command line interface to validate a run: dacapo validate -r {run_config.name} -i {iteration}\n", + " Generally we setup training to automatically validate at a set interval and the model checkpoints are saved at these intervals." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.validate import validate\n", + "\n", + "validate(run_config.name, iterations, num_workers=16, overwrite=True)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Predict\n", + " Once you have trained and validated your model, you can use it to predict on new data. You can use the `dacapo.predict` function to do this. 
You can also use the command line interface to predict on a run: dacapo predict -r {run_config.name} -i {iteration} -ic {input_container} -id {input_dataset} -op {output_path}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First let's make some test data\n", + "test_data_path = Path(runs_base_dir, \"example_test.zarr\")\n", + "try:\n", + " assert not force\n", + " raw_array = open_ds(str(test_data_path), \"raw\")\n", + " labels_array = open_ds(str(test_data_path), \"labels\")\n", + "except:\n", + " test_shape = Coordinate((152, 152, 152)) * 5\n", + " generate_synthetic_dataset(\n", + " test_data_path,\n", + " shape=test_shape,\n", + " overwrite=True,\n", + " write_shape=Coordinate((152, 152, 152)),\n", + " num_workers=num_workers,\n", + " )\n", + "\n", + "get_viewer(raw_array, labels_array)\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.predict import predict\n", + "\n", + "predict(\n", + " run_config.name,\n", + " iterations,\n", + " test_data_path,\n", + " \"raw\",\n", + " test_data_path,\n", + " num_workers=32,\n", + " overwrite=True,\n", + " output_dtype=\"float32\",\n", + " output_roi=raw_array.roi,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.validate import validate_run\n", + "\n", + "validate_run(run.name, 50, num_workers=32)\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "" + ] + } + ], + "nbformat": 4, + "nbformat_minor": 2, + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + } + } +} \ No newline at end of file diff --git a/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.py b/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.py new file mode 100644 index 00000000..287fe9f0 --- /dev/null +++ b/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.py @@ -0,0 +1,385 @@ +# %% [markdown] +# # Dacapo +# +# DaCapo is a framework that allows for easy configuration and execution of established machine learning techniques on arbitrarily large volumes of multi-dimensional images. +# +# DaCapo has 4 major configurable components: +# 1. **dacapo.datasplits.DataSplit** +# +# 2. **dacapo.tasks.Task** +# +# 3. **dacapo.architectures.Architecture** +# +# 4. **dacapo.trainers.Trainer** +# +# These are then combined in a single **dacapo.experiments.Run** that includes your starting point (whether you want to start training from scratch or continue off of a previously trained model) and stopping criterion (the number of iterations you want to train). + +# %% [markdown] +# ## Environment setup +# If you have not already done so, you will need to install DaCapo. You can do this by first creating a new environment and then installing DaCapo using pip. 
+# +# ```bash +# conda create -n dacapo python=3.10 +# conda activate dacapo +# ``` +# +# Then, you can install DaCapo using pip, via GitHub: +# +# ```bash +# pip install git+https://github.com/janelia-cellmap/dacapo.git +# ``` +# +# Or you can clone the repository and install it locally: +# +# ```bash +# git clone https://github.com/janelia-cellmap/dacapo.git +# cd dacapo +# pip install -e . +# ``` +# +# Be sure to select this environment in your Jupyter notebook or JupyterLab. + +# %% [markdown] +# ## Config Store +# To define where the data goes, create a dacapo.yaml configuration file either in `~/.config/dacapo/dacapo.yaml` or in `./dacapo.yaml`. Here is a template: +# +# ```yaml +# type: files +# runs_base_dir: /path/to/my/data/storage +# ``` +# The `runs_base_dir` defines where your on-disk data will be stored. The `type` setting determines the database backend. The default is `files`, which stores the data in a file tree on disk. Alternatively, you can use `mongodb` to store the data in a MongoDB database. To use MongoDB, you will need to provide a `mongodbhost` and `mongodbname` in the configuration file: +# +# ```yaml +# ... +# mongodbhost: mongodb://dbuser:dbpass@dburl:dbport/ +# mongodbname: dacapo + +# %% +# First we need to create a config store to store our configurations +from dacapo.store.create_store import create_config_store + +config_store = create_config_store() + +# %% +# Then let's make sure we have data to train on +from pathlib import Path +from dacapo import Options +from dacapo.examples.utils import get_viewer +from dacapo.examples.synthetic_source_worker import generate_synthetic_dataset +from funlib.geometry import Coordinate +from funlib.persistence import open_ds + +options = Options.instance() +runs_base_dir = options.runs_base_dir +force = False +num_workers = 32 + +# First for training data +train_data_path = Path(runs_base_dir, "example_train.zarr") +try: + assert not force + raw_array = open_ds(str(train_data_path), "raw") + labels_array = open_ds(str(train_data_path), "labels") +except: + train_shape = Coordinate((512, 512, 512)) + generate_synthetic_dataset( + train_data_path, shape=train_shape, overwrite=True, num_workers=num_workers + ) + raw_array = open_ds(str(train_data_path), "raw") + labels_array = open_ds(str(train_data_path), "labels") + +get_viewer(raw_array, labels_array) + +# %% +# Then for validation data +validate_data_path = Path(runs_base_dir, "example_validate.zarr") +try: + assert not force + raw_array = open_ds(str(validate_data_path), "raw") + labels_array = open_ds(str(validate_data_path), "labels") +except: + validate_shape = Coordinate((152, 152, 152)) * 3 + generate_synthetic_dataset( + validate_data_path, + shape=validate_shape, + write_shape=Coordinate((152, 152, 152)), + overwrite=True, + num_workers=num_workers, + ) + +get_viewer(raw_array, labels_array) + +# %% [markdown] +# ## Datasplit +# Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation? + +# We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`. +# NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. 
For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs. + +# %% +from dacapo.experiments.datasplits.datasets.arrays import ( + BinarizeArrayConfig, + ZarrArrayConfig, + IntensitiesArrayConfig, +) +from dacapo.experiments.datasplits import TrainValidateDataSplitConfig +from dacapo.experiments.datasplits.datasets import RawGTDatasetConfig +from pathlib import Path +from dacapo import Options + +options = Options.instance() +runs_base_dir = options.runs_base_dir + +datasplit_config = TrainValidateDataSplitConfig( + name="synthetic_datasplit_config", + train_configs=[ + RawGTDatasetConfig( + name="train_data", + weight=1, + raw_config=IntensitiesArrayConfig( + name="raw_train_data", + source_array_config=ZarrArrayConfig( + name="raw_train_data_uint8", + file_name=Path(runs_base_dir, "example_train.zarr"), + dataset="raw", + ), + min=0.0, + max=255.0, + ), + gt_config=BinarizeArrayConfig( + name="gt_train_data", + source_array_config=ZarrArrayConfig( + name="gt_train_data_zarr", + file_name=Path(runs_base_dir, "example_train.zarr"), + dataset="labels", + ), + groupings=[("labels", [])], + ), + ) + ], + validate_configs=[ + RawGTDatasetConfig( + name="validate_data", + weight=1, + raw_config=IntensitiesArrayConfig( + name="raw_validate_data", + source_array_config=ZarrArrayConfig( + name="raw_validate_data_uint8", + file_name=Path(runs_base_dir, "example_validate.zarr"), + dataset="raw", + ), + min=0.0, + max=255.0, + ), + gt_config=BinarizeArrayConfig( + name="gt_validate_data", + source_array_config=ZarrArrayConfig( + name="gt_validate_data_zarr", + file_name=Path(runs_base_dir, "example_validate.zarr"), + dataset="labels", + ), + groupings=[("labels", [])], + ), + ), + ], +) + +config_store.store_datasplit_config(datasplit_config) + +# %% [markdown] +# ## Task +# What do you want to learn? An instance segmentation? If so, how? Affinities, +# Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned +# and evaluated with specific loss functions and evaluation metrics. Some tasks may +# also require specific non-linearities or output formats from your model. + +# %% +from dacapo.experiments.tasks import DistanceTaskConfig + +task_config = DistanceTaskConfig( + name="example_distance_task", + channels=["labels"], + clip_distance=80.0, + tol_distance=80.0, + scale_factor=160.0, +) +config_store.store_task_config(task_config) + +# %% [markdown] +# ## Architecture +# +# The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want? + +# %% +from dacapo.experiments.architectures import CNNectomeUNetConfig + +architecture_config = CNNectomeUNetConfig( + name="example-unet", + input_shape=(172, 172, 172), + fmaps_out=24, + fmaps_in=1, + num_fmaps=12, + fmap_inc_factor=2, + downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)], + eval_shape_increase=(72, 72, 72), +) +config_store.store_architecture_config(architecture_config) + +# %% [markdown] +# ## Trainer +# +# How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with. 
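+#
+# As a quick illustration of one of the augmentations configured in the next cell: the
+# `IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0)` rescales raw intensities from
+# [0, 1] to [-1, 1] (scale first, then shift). This is plain NumPy arithmetic, shown here
+# only for intuition and independent of DaCapo:
+#
+# ```python
+# import numpy as np
+#
+# raw = np.array([0.0, 0.25, 0.5, 1.0])
+# rescaled = raw * 2.0 + (-1.0)
+# print(rescaled)  # -> -1.0, -0.5, 0.0, 1.0
+# ```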
+ +# %% +from dacapo.experiments.trainers import GunpowderTrainerConfig +from dacapo.experiments.trainers.gp_augments import ( + ElasticAugmentConfig, + GammaAugmentConfig, + IntensityAugmentConfig, + IntensityScaleShiftAugmentConfig, +) + +trainer_config = GunpowderTrainerConfig( + name="default", + batch_size=1, + learning_rate=0.0001, + num_data_fetchers=20, + augments=[ + ElasticAugmentConfig( + control_point_spacing=[100, 100, 100], + control_point_displacement_sigma=[10.0, 10.0, 10.0], + rotation_interval=(0.0, 1.5707963267948966), + subsample=8, + uniform_3d_rotation=True, + ), + IntensityAugmentConfig(scale=(0.25, 1.75), shift=(-0.5, 0.35), clip=True), + GammaAugmentConfig(gamma_range=(0.5, 2.0)), + IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0), + ], + snapshot_interval=10000, + min_masked=0.05, + clip_raw=True, +) +config_store.store_trainer_config(trainer_config) + +# %% [markdown] +# ## Run +# Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum. + +# %% +from dacapo.experiments import RunConfig +from dacapo.experiments.run import Run + +start_config = None + +# Uncomment to start from a pretrained model +# start_config = StartConfig( +# "setup04", +# "best", +# ) + +iterations = 2000 +validation_interval = iterations // 2 +repetitions = 1 +for i in range(repetitions): + run_config = RunConfig( + name="example_synthetic_distance_run", + # # NOTE: This is a template for the name of the run. You can customize it as you see fit. + # name=("_").join( + # [ + # "example", + # "scratch" if start_config is None else "finetuned", + # datasplit_config.name, + # task_config.name, + # architecture_config.name, + # trainer_config.name, + # ] + # ) + # + f"__{i}", + datasplit_config=datasplit_config, + task_config=task_config, + architecture_config=architecture_config, + trainer_config=trainer_config, + num_iterations=iterations, + validation_interval=validation_interval, + repetition=i, + start_config=start_config, + ) + + print(run_config.name) + config_store.store_run_config(run_config) + +# %% [markdown] +# ## Train + +# To train one of the runs, you can either do it by first creating a **Run** directly from the run config +# NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. This may cause errors +# %% +from dacapo.train import train_run +from dacapo.experiments.run import Run +from dacapo.store.create_store import create_config_store + +config_store = create_config_store() + +run = Run(config_store.retrieve_run_config("example_synthetic_distance_run")) +train_run(run) + +# %% [markdown] +# If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements. + + +# %% [markdown] +# ## Validate + +# Once you have trained your model, you can validate it on the validation datasets used during training. You can use the `dacapo.validate` function to do this. 
You can also use the command line interface to validate a run: dacapo validate -r {run_config.name} -i {iteration} + +# Generally we setup training to automatically validate at a set interval and the model checkpoints are saved at these intervals. + +# %% +from dacapo.validate import validate + +validate(run_config.name, iterations, num_workers=16, overwrite=True) + +# %% [markdown] +# ## Predict +# Once you have trained and validated your model, you can use it to predict on new data. You can use the `dacapo.predict` function to do this. You can also use the command line interface to predict on a run: dacapo predict -r {run_config.name} -i {iteration} -ic {input_container} -id {input_dataset} -op {output_path} + +# %% +# First let's make some test data +test_data_path = Path(runs_base_dir, "example_test.zarr") +try: + assert not force + raw_array = open_ds(str(test_data_path), "raw") + labels_array = open_ds(str(test_data_path), "labels") +except: + test_shape = Coordinate((152, 152, 152)) * 5 + generate_synthetic_dataset( + test_data_path, + shape=test_shape, + overwrite=True, + write_shape=Coordinate((152, 152, 152)), + num_workers=num_workers, + ) + +get_viewer(raw_array, labels_array) + +# %% +from dacapo.predict import predict + +predict( + run_config.name, + iterations, + test_data_path, + "raw", + test_data_path, + num_workers=32, + overwrite=True, + output_dtype="float32", + output_roi=raw_array.roi, +) +# %% +from dacapo.validate import validate_run + +validate_run(run.name, 50, num_workers=32) + +# %% diff --git a/dacapo/examples/distance_task/cosem_example.ipynb b/dacapo/examples/distance_task/cosem_example.ipynb index 916cb7c5..846c15ef 100644 --- a/dacapo/examples/distance_task/cosem_example.ipynb +++ b/dacapo/examples/distance_task/cosem_example.ipynb @@ -252,7 +252,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/dacapo/examples/distance_task/cosem_example_fill_in_the_blank.py b/dacapo/examples/distance_task/cosem_example_fill_in_the_blank.py index 9a7fbaf7..047f006e 100644 --- a/dacapo/examples/distance_task/cosem_example_fill_in_the_blank.py +++ b/dacapo/examples/distance_task/cosem_example_fill_in_the_blank.py @@ -26,8 +26,7 @@ # Create the datasplit, produce the neuroglancer link and store the datasplit datasplit = ... viewer = ... -config_store... - +config_store # %% [markdown] # ## Task # What do you want to learn? An instance segmentation? If so, how? Affinities, @@ -40,9 +39,8 @@ # Create a distance task config where the clip_distance=tol_distance=10x the output resolution, # and scale_factor = 20x the output resolution -task_config = -config_store.... - +task_config = ... +config_store # %% [markdown] # ## Architecture # @@ -97,14 +95,14 @@ # Create a gamma augment config with range .5 to 2 ..., # Create an intensity scale shift agument config to rescale data from the range 0->1 to -1->1 - ..., + ..., ], snapshot_interval=10000, min_masked=0.05, clip_raw=True, ) # Store the trainer -config_store.... +config_store # %% [markdown] # ## Run @@ -128,7 +126,7 @@ run_config = ... print(run_config.name) -config_store... +config_store # %% [markdown] # ## Train @@ -138,6 +136,7 @@ # %% from dacapo.train import train_run from dacapo.experiments.run import Run + # load the run and train it -run = Run(config_store...) 
+run = Run(config_store) train_run(run) diff --git a/dacapo/examples/synthetic_source_worker.py b/dacapo/examples/synthetic_source_worker.py index cdf21624..fb394cc3 100644 --- a/dacapo/examples/synthetic_source_worker.py +++ b/dacapo/examples/synthetic_source_worker.py @@ -38,6 +38,7 @@ def cli(log_level): fit = "valid" + def generate_synthetic_dataset( output_container: Path | str, raw_output_dataset: str = "raw", diff --git a/dacapo/examples/utils.py b/dacapo/examples/utils.py index 184e594f..e4268590 100644 --- a/dacapo/examples/utils.py +++ b/dacapo/examples/utils.py @@ -17,6 +17,7 @@ import copy import json + def get_viewer( raw_array: gp.Array | Array | ZarrArray, labels_array: gp.Array | Array | ZarrArray, @@ -213,7 +214,8 @@ def update_with_new_validation_if_possible(self): self.most_recent_iteration, validation_dataset.name, ) - + + def add_seg_layer(state, name, data, voxel_size): state.layers[name] = neuroglancer.SegmentationLayer( # segments=[str(i) for i in np.unique(data[data > 0])], # this line will cause all objects to be selected and thus all meshes to be generated...will be slow if lots of high res meshes From 1c756a4341aa63775a751568c9ae32fc9132f657 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Tue, 19 Mar 2024 13:28:01 -0400 Subject: [PATCH 02/20] =?UTF-8?q?fix:=20=F0=9F=90=9B=20Fix=20default=20run?= =?UTF-8?q?s=5Fbase=5Fdir?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dacapo/options.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dacapo/options.py b/dacapo/options.py index 6aac8cfb..cb45cc98 100644 --- a/dacapo/options.py +++ b/dacapo/options.py @@ -22,7 +22,7 @@ class DaCapoConfig: }, ) runs_base_dir: Path = attr.ib( - default=Path(expanduser("~/.dacapo")), + default=Path(expanduser("~/dacapo")), metadata={ "help_text": "The path at DaCapo will use for reading and writing any necessary data. This should be an absolute path." }, @@ -72,8 +72,9 @@ def config_file(cls) -> Optional[Path]: # options files in order of precedence (highest first) options_files += [ Path("./dacapo.yaml"), - Path(Path(__file__).parent.parent, "dacapo.yaml"), + Path("~/dacapo.yaml"), Path(expanduser("~/.config/dacapo/dacapo.yaml")), + Path(Path(__file__).parent.parent, "dacapo.yaml"), ] for path in options_files: if path.exists(): From 8cae4702c579dc5a46849ff05e992187cdf418e5 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Tue, 19 Mar 2024 18:09:42 +0000 Subject: [PATCH 03/20] :art: Format Python code with psf/black --- .../examples/distance_task/cosem_example.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/dacapo/examples/distance_task/cosem_example.py b/dacapo/examples/distance_task/cosem_example.py index 30dc262e..da07f091 100644 --- a/dacapo/examples/distance_task/cosem_example.py +++ b/dacapo/examples/distance_task/cosem_example.py @@ -1,3 +1,61 @@ +# %% [markdown] +# # Dacapo +# +# DaCapo is a framework that allows for easy configuration and execution of established machine learning techniques on arbitrarily large volumes of multi-dimensional images. +# +# DaCapo has 4 major configurable components: +# 1. **dacapo.datasplits.DataSplit** +# +# 2. **dacapo.tasks.Task** +# +# 3. **dacapo.architectures.Architecture** +# +# 4. 
**dacapo.trainers.Trainer** +# +# These are then combined in a single **dacapo.experiments.Run** that includes your starting point (whether you want to start training from scratch or continue off of a previously trained model) and stopping criterion (the number of iterations you want to train). + +# %% [markdown] +# ## Environment setup +# If you have not already done so, you will need to install DaCapo. You can do this by first creating a new environment and then installing DaCapo using pip. +# +# ```bash +# conda create -n dacapo python=3.10 +# conda activate dacapo +# ``` +# +# Then, you can install DaCapo using pip, via GitHub: +# +# ```bash +# pip install git+https://github.com/janelia-cellmap/dacapo.git +# ``` +# +# Or you can clone the repository and install it locally: +# +# ```bash +# git clone https://github.com/janelia-cellmap/dacapo.git +# cd dacapo +# pip install -e . +# ``` +# +# Be sure to select this environment in your Jupyter notebook or JupyterLab. + +# %% [markdown] +""" +## Config Store +To define where the data goes, create a dacapo.yaml configuration file either in `~/.config/dacapo/dacapo.yaml` or in `./dacapo.yaml`. Here is a template: + +```yaml +type: files +runs_base_dir: /path/to/my/data/storage +``` +The `runs_base_dir` defines where your on-disk data will be stored. The `type` setting determines the database backend. The default is `files`, which stores the data in a file tree on disk. Alternatively, you can use `mongodb` to store the data in a MongoDB database. To use MongoDB, you will need to provide a `mongodbhost` and `mongodbname` in the configuration file: + +```yaml +... +mongodbhost: mongodb://dbuser:dbpass@dburl:dbport/ +mongodbname: dacapo +""" + # %% # First we need to create a config store to store our configurations from dacapo.store.create_store import create_config_store From 34ba7a740acbf6aa549f1085f8360a0f35c73648 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Tue, 19 Mar 2024 15:40:48 -0400 Subject: [PATCH 04/20] =?UTF-8?q?chore:=20=F0=9F=A9=B9=20Make=20prediction?= =?UTF-8?q?/validation=20single=20worker=20to=20fix=20patches?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../distance_task/synthetic_example.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dacapo/examples/distance_task/synthetic_example.py b/dacapo/examples/distance_task/synthetic_example.py index 061e56e8..a8e75498 100644 --- a/dacapo/examples/distance_task/synthetic_example.py +++ b/dacapo/examples/distance_task/synthetic_example.py @@ -137,7 +137,7 @@ datasplit = datasplit_config.datasplit_type(datasplit_config) viewer = datasplit._neuroglancer() -config_store.store_datasplit_config(datasplit_config) +# config_store.store_datasplit_config(datasplit_config) # %% [markdown] # The above datasplit_generator automates a lot of the heavy lifting for configuring data to set up a run. The following shows everything that it is doing, and an equivalent way to set up the datasplit. 
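#
# NOTE: config names are unique in the store, so calling `store_*` twice with the same name
# raises an error; that is why the `store_*` calls in this example may be left commented out
# after the first run. A rough sketch of a delete-and-restore pattern (assuming the store
# exposes `delete_datasplit_config`, mirroring the `delete_architecture_config` call that
# appears later in this file):
#
# ```python
# try:
#     config_store.store_datasplit_config(datasplit_config)
# except Exception:
#     config_store.delete_datasplit_config(datasplit_config.name)
#     config_store.store_datasplit_config(datasplit_config)
# ```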
@@ -232,7 +232,7 @@ tol_distance=80.0, scale_factor=160.0, ) -config_store.store_task_config(task_config) +# config_store.store_task_config(task_config) # %% [markdown] # ## Architecture @@ -252,11 +252,11 @@ downsample_factors=[(2, 2, 2), (2, 2, 2), (2, 2, 2)], eval_shape_increase=(72, 72, 72), ) -try: - config_store.store_architecture_config(architecture_config) -except: - config_store.delete_architecture_config(architecture_config.name) - config_store.store_architecture_config(architecture_config) +# try: +# config_store.store_architecture_config(architecture_config) +# except: +# config_store.delete_architecture_config(architecture_config.name) +# config_store.store_architecture_config(architecture_config) # %% [markdown] # ## Trainer @@ -293,7 +293,7 @@ min_masked=0.05, clip_raw=True, ) -config_store.store_trainer_config(trainer_config) +# config_store.store_trainer_config(trainer_config) # %% [markdown] # ## Run @@ -311,7 +311,7 @@ # "best", # ) -iterations = 2000 +iterations = 200 validation_interval = iterations // 2 repetitions = 1 for i in range(repetitions): @@ -376,7 +376,7 @@ # %% from dacapo.validate import validate -validate(run_config.name, iterations, num_workers=16, overwrite=True) +validate(run_config.name, iterations, num_workers=1, overwrite=True) # %% [markdown] # ## Predict From f9a0bbbd4b86c8e60ea30961e5e5f6cd47ec5174 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Tue, 19 Mar 2024 15:42:57 -0400 Subject: [PATCH 05/20] =?UTF-8?q?chore:=20=F0=9F=8E=A8=20Black=20format.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dacapo/examples/distance_task/cosem_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dacapo/examples/distance_task/cosem_example.py b/dacapo/examples/distance_task/cosem_example.py index 6301df2f..da07f091 100644 --- a/dacapo/examples/distance_task/cosem_example.py +++ b/dacapo/examples/distance_task/cosem_example.py @@ -1,4 +1,3 @@ - # %% [markdown] # # Dacapo # From bc1fdbad9342fdaeb1c1fffb798f43b866ccfdd3 Mon Sep 17 00:00:00 2001 From: Marwan Zouinkhi Date: Tue, 19 Mar 2024 15:54:22 -0400 Subject: [PATCH 06/20] fix registry --- dacapo/store/conversion_hooks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dacapo/store/conversion_hooks.py b/dacapo/store/conversion_hooks.py index 802ec62b..934b4e47 100644 --- a/dacapo/store/conversion_hooks.py +++ b/dacapo/store/conversion_hooks.py @@ -21,6 +21,7 @@ def register_hierarchy_hooks(converter): """Central place to register type hierarchies for conversion.""" converter.register_hierarchy(TaskConfig, cls_fun) + converter.register_hierarchy(StartConfig, cls_fun) converter.register_hierarchy(ArchitectureConfig, cls_fun) converter.register_hierarchy(TrainerConfig, cls_fun) converter.register_hierarchy(AugmentConfig, cls_fun) From 6bb5d47a9d064b6d95c6607de5f4d6e4252d454b Mon Sep 17 00:00:00 2001 From: Marwan Zouinkhi Date: Tue, 19 Mar 2024 16:51:16 -0400 Subject: [PATCH 07/20] head matching --- dacapo/experiments/run.py | 10 ++- dacapo/experiments/starts/cosem_start.py | 30 ++++++++- dacapo/experiments/starts/start.py | 79 ++++++++++++++++++------ 3 files changed, 95 insertions(+), 24 deletions(-) diff --git a/dacapo/experiments/run.py b/dacapo/experiments/run.py index a405bef9..3af70139 100644 --- a/dacapo/experiments/run.py +++ b/dacapo/experiments/run.py @@ -59,8 +59,14 @@ def __init__(self, run_config): if run_config.start_config is not None else None ) - if self.start is not None: - self.start.initialize_weights(self.model) + if self.start is 
None: + return + else: + if hasattr(run_config.task_config,"channels"): + new_head = run_config.task_config.channels + else: + new_head = None + self.start.initialize_weights(self.model,new_head=new_head) @staticmethod def get_validation_scores(run_config) -> ValidationScores: diff --git a/dacapo/experiments/starts/cosem_start.py b/dacapo/experiments/starts/cosem_start.py index 89bcaad0..d5aff870 100644 --- a/dacapo/experiments/starts/cosem_start.py +++ b/dacapo/experiments/starts/cosem_start.py @@ -6,11 +6,34 @@ logger = logging.getLogger(__file__) - +def get_model_setup(run): + try: + model = cosem.load_model(run) + if hasattr(model, "classes_channels"): + classes_channels = model.classes_channels + else: + classes_channels = None + if hasattr(model, "voxel_size_input"): + voxel_size_input = model.voxel_size_input + else: + voxel_size_input = None + if hasattr(model, "voxel_size_output"): + voxel_size_output = model.voxel_size_output + else: + voxel_size_output = None + return classes_channels, voxel_size_input, voxel_size_output + except Exception as e: + logger.error(f"could not load model setup: {e} - Not a big deal, model will train wiithout head matching") + return None, None, None + class CosemStart(Start): def __init__(self, start_config): super().__init__(start_config) self.name = f"{self.run}/{self.criterion}" + channels, voxel_size_input, voxel_size_output = get_model_setup(self.run) + if voxel_size_input is not None: + logger.warning(f"Starter model resolution: input {voxel_size_input} output {voxel_size_output}, Make sure to set the correct resolution for the input data.") + self.channels = channels def check(self): from dacapo.store.create_store import create_weights_store @@ -25,7 +48,8 @@ def check(self): else: logger.info(f"Checkpoint for {self.name} exists.") - def initialize_weights(self, model): + def initialize_weights(self, model, new_head=None): + self.check() from dacapo.store.create_store import create_weights_store weights_store = create_weights_store() @@ -36,4 +60,4 @@ def initialize_weights(self, model): path = weights_dir / self.criterion cosem.download_checkpoint(self.name, path) weights = weights_store._retrieve_weights(self.run, self.criterion) - super._set_weights(model, weights) + super._set_weights(model, weights, new_head=new_head) diff --git a/dacapo/experiments/starts/start.py b/dacapo/experiments/starts/start.py index fcf3b12a..e24f70cb 100644 --- a/dacapo/experiments/starts/start.py +++ b/dacapo/experiments/starts/start.py @@ -3,6 +3,19 @@ logger = logging.getLogger(__file__) +head_keys = ["prediction_head.weight","prediction_head.bias","chain.1.weight","chain.1.bias"] + +def match_heads(model, head_weights, old_head, new_head ): + for label in new_head: + if label in old_head: + logger.warning(f"matching head for {label}.") + old_index = old_head.index(label) + new_index = new_head.index(label) + for key in head_keys: + if key in model.state_dict().keys(): + n_val = head_weights[key][old_index] + model.state_dict()[key][new_index] = n_val + logger.warning(f"matched head for {label}.") class Start(ABC): """ @@ -32,28 +45,55 @@ def __init__(self, start_config): self.run = start_config.run self.criterion = start_config.criterion - def _set_weights(self, model, weights): + if hasattr(start_config.task_config,"channels"): + self.channels = start_config.task_config.channels + else: + self.channels = None + + def _set_weights(self, model, weights,new_head=None): print(f"loading weights from run {self.run}, criterion: {self.criterion}") - # load the model 
weights (taken from torch load_state_dict source) try: - model.load_state_dict(weights.model) + if self.channels and new_head: + try: + logger.warning(f"matching heads from run {self.run}, criterion: {self.criterion}") + logger.warning(f"old head: {self.channels}") + logger.warning(f"new head: {new_head}") + head_weights = {} + for key in head_keys: + head_weights[key] = weights.model[key] + for key in head_keys: + weights.model.pop(key, None) + model.load_state_dict(weights.model, strict=False) + model = match_heads(model, head_weights, self.channels, new_head) + except RuntimeError as e: + logger.error(f"ERROR starter matching head: {e}") + logger.warning(f"removing head from run {self.run}, criterion: {self.criterion}") + for key in head_keys: + weights.model.pop(key, None) + model.load_state_dict(weights.model, strict=False) + logger.warning(f"loaded weights in non strict mode from run {self.run}, criterion: {self.criterion}") + else: + try: + model.load_state_dict(weights.model) + except RuntimeError as e: + logger.warning(e) + model_dict = model.state_dict() + pretrained_dict = { + k: v + for k, v in weights.model.items() + if k in model_dict and v.size() == model_dict[k].size() + } + model_dict.update( + pretrained_dict + ) + model.load_state_dict(model_dict) + logger.warning(f"loaded only common layers from weights") except RuntimeError as e: - logger.warning(e) - # if the model is not the same, we can try to load the weights - # of the common layers - model_dict = model.state_dict() - pretrained_dict = { - k: v - for k, v in weights.model.items() - if k in model_dict and v.size() == model_dict[k].size() - } - model_dict.update( - pretrained_dict - ) # update only the existing and matching layers - model.load_state_dict(model_dict) - logger.warning(f"loaded only common layers from weights") + logger.warning(f"ERROR starter: {e}") - def initialize_weights(self, model): + + + def initialize_weights(self, model,new_head=None): """ Retrieves the weights from the dacapo store and load them into the model. 
@@ -72,4 +112,5 @@ def initialize_weights(self, model): weights_store = create_weights_store() weights = weights_store._retrieve_weights(self.run, self.criterion) - self._set_weights(model, weights) + self._set_weights(model, weights,new_head) + From d8076d5503eb7a4fc3c4e15b8a75660ae8ea4263 Mon Sep 17 00:00:00 2001 From: Marwan Zouinkhi Date: Tue, 19 Mar 2024 17:21:10 -0400 Subject: [PATCH 08/20] fix minor errors --- dacapo/experiments/starts/cosem_start.py | 9 ++- dacapo/experiments/starts/start.py | 88 ++++++++++++------------ 2 files changed, 49 insertions(+), 48 deletions(-) diff --git a/dacapo/experiments/starts/cosem_start.py b/dacapo/experiments/starts/cosem_start.py index d5aff870..99930cee 100644 --- a/dacapo/experiments/starts/cosem_start.py +++ b/dacapo/experiments/starts/cosem_start.py @@ -2,7 +2,7 @@ import logging from cellmap_models import cosem from pathlib import Path -from .start import Start +from .start import Start, _set_weights logger = logging.getLogger(__file__) @@ -28,7 +28,8 @@ def get_model_setup(run): class CosemStart(Start): def __init__(self, start_config): - super().__init__(start_config) + self.run = start_config.run + self.criterion = start_config.criterion self.name = f"{self.run}/{self.criterion}" channels, voxel_size_input, voxel_size_output = get_model_setup(self.run) if voxel_size_input is not None: @@ -60,4 +61,6 @@ def initialize_weights(self, model, new_head=None): path = weights_dir / self.criterion cosem.download_checkpoint(self.name, path) weights = weights_store._retrieve_weights(self.run, self.criterion) - super._set_weights(model, weights, new_head=new_head) + _set_weights(model, weights, self.run, self.criterion, self.channels, new_head) + + diff --git a/dacapo/experiments/starts/start.py b/dacapo/experiments/starts/start.py index e24f70cb..9b76aab5 100644 --- a/dacapo/experiments/starts/start.py +++ b/dacapo/experiments/starts/start.py @@ -17,6 +17,47 @@ def match_heads(model, head_weights, old_head, new_head ): model.state_dict()[key][new_index] = n_val logger.warning(f"matched head for {label}.") +def _set_weights(model, weights, run, criterion, old_head=None, new_head=None): + logger.warning(f"loading weights from run {run}, criterion: {criterion}, old_head {old_head}, new_head: {new_head}") + try: + if old_head and new_head: + try: + logger.warning(f"matching heads from run {run}, criterion: {criterion}") + logger.warning(f"old head: {old_head}") + logger.warning(f"new head: {new_head}") + head_weights = {} + for key in head_keys: + head_weights[key] = weights.model[key] + for key in head_keys: + weights.model.pop(key, None) + model.load_state_dict(weights.model, strict=False) + model = match_heads(model, head_weights, old_head, new_head) + except RuntimeError as e: + logger.error(f"ERROR starter matching head: {e}") + logger.warning(f"removing head from run {run}, criterion: {criterion}") + for key in head_keys: + weights.model.pop(key, None) + model.load_state_dict(weights.model, strict=False) + logger.warning(f"loaded weights in non strict mode from run {run}, criterion: {criterion}") + else: + try: + model.load_state_dict(weights.model) + except RuntimeError as e: + logger.warning(e) + model_dict = model.state_dict() + pretrained_dict = { + k: v + for k, v in weights.model.items() + if k in model_dict and v.size() == model_dict[k].size() + } + model_dict.update( + pretrained_dict + ) + model.load_state_dict(model_dict) + logger.warning(f"loaded only common layers from weights") + except RuntimeError as e: + logger.warning(f"ERROR 
starter: {e}") + class Start(ABC): """ This class interfaces with the dacapo store to retrieve and load the @@ -48,50 +89,7 @@ def __init__(self, start_config): if hasattr(start_config.task_config,"channels"): self.channels = start_config.task_config.channels else: - self.channels = None - - def _set_weights(self, model, weights,new_head=None): - print(f"loading weights from run {self.run}, criterion: {self.criterion}") - try: - if self.channels and new_head: - try: - logger.warning(f"matching heads from run {self.run}, criterion: {self.criterion}") - logger.warning(f"old head: {self.channels}") - logger.warning(f"new head: {new_head}") - head_weights = {} - for key in head_keys: - head_weights[key] = weights.model[key] - for key in head_keys: - weights.model.pop(key, None) - model.load_state_dict(weights.model, strict=False) - model = match_heads(model, head_weights, self.channels, new_head) - except RuntimeError as e: - logger.error(f"ERROR starter matching head: {e}") - logger.warning(f"removing head from run {self.run}, criterion: {self.criterion}") - for key in head_keys: - weights.model.pop(key, None) - model.load_state_dict(weights.model, strict=False) - logger.warning(f"loaded weights in non strict mode from run {self.run}, criterion: {self.criterion}") - else: - try: - model.load_state_dict(weights.model) - except RuntimeError as e: - logger.warning(e) - model_dict = model.state_dict() - pretrained_dict = { - k: v - for k, v in weights.model.items() - if k in model_dict and v.size() == model_dict[k].size() - } - model_dict.update( - pretrained_dict - ) - model.load_state_dict(model_dict) - logger.warning(f"loaded only common layers from weights") - except RuntimeError as e: - logger.warning(f"ERROR starter: {e}") - - + self.channels = None def initialize_weights(self, model,new_head=None): """ @@ -112,5 +110,5 @@ def initialize_weights(self, model,new_head=None): weights_store = create_weights_store() weights = weights_store._retrieve_weights(self.run, self.criterion) - self._set_weights(model, weights,new_head) + _set_weights(model, weights, self.run, self.criterion, self.channels, new_head) From 8bb5d502298d96be06be04ce8cea782b540c8224 Mon Sep 17 00:00:00 2001 From: Jeff Rhoades <37990507+rhoadesScholar@users.noreply.github.com> Date: Wed, 20 Mar 2024 09:37:06 -0400 Subject: [PATCH 09/20] Update start.py --- dacapo/experiments/starts/start.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dacapo/experiments/starts/start.py b/dacapo/experiments/starts/start.py index 9b76aab5..6204c56f 100644 --- a/dacapo/experiments/starts/start.py +++ b/dacapo/experiments/starts/start.py @@ -30,7 +30,11 @@ def _set_weights(model, weights, run, criterion, old_head=None, new_head=None): head_weights[key] = weights.model[key] for key in head_keys: weights.model.pop(key, None) - model.load_state_dict(weights.model, strict=False) + try: + model.load_state_dict(weights.model, strict=True) + except: + logger.warning("Unable to load model in strict mode. 
Loading flexibly.") + model.load_state_dict(weights.model, strict=False) model = match_heads(model, head_weights, old_head, new_head) except RuntimeError as e: logger.error(f"ERROR starter matching head: {e}") From 34d11be58c971379f4488922fd9d853174a2bd79 Mon Sep 17 00:00:00 2001 From: Jeff Rhoades <37990507+rhoadesScholar@users.noreply.github.com> Date: Wed, 20 Mar 2024 09:42:23 -0400 Subject: [PATCH 10/20] Update start.py --- dacapo/experiments/starts/start.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dacapo/experiments/starts/start.py b/dacapo/experiments/starts/start.py index 6204c56f..6c162203 100644 --- a/dacapo/experiments/starts/start.py +++ b/dacapo/experiments/starts/start.py @@ -13,8 +13,8 @@ def match_heads(model, head_weights, old_head, new_head ): new_index = new_head.index(label) for key in head_keys: if key in model.state_dict().keys(): - n_val = head_weights[key][old_index] - model.state_dict()[key][new_index] = n_val + new_value = head_weights[key][old_index] + model.state_dict()[key][new_index] = new_value logger.warning(f"matched head for {label}.") def _set_weights(model, weights, run, criterion, old_head=None, new_head=None): From c39f72b7425fd1ca103a8b53df9ee025f7c08108 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 20 Mar 2024 11:24:32 -0400 Subject: [PATCH 11/20] =?UTF-8?q?perf:=20=E2=9A=A1=EF=B8=8F=20Restrict=20l?= =?UTF-8?q?ocal=20prediction=20to=20one=20worker.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dacapo/predict.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/dacapo/predict.py b/dacapo/predict.py index 2d068653..622811a6 100644 --- a/dacapo/predict.py +++ b/dacapo/predict.py @@ -3,9 +3,10 @@ from dacapo.blockwise import run_blockwise import dacapo.blockwise from dacapo.experiments import Run -from dacapo.store.create_store import create_config_store, create_weights_store +from dacapo.store.create_store import create_config_store from dacapo.store.local_array_store import LocalArrayIdentifier from dacapo.experiments.datasplits.datasets.arrays import ZarrArray +from dacapo.compute_context import create_compute_context, LocalTorch from funlib.geometry import Coordinate, Roi import numpy as np @@ -37,7 +38,7 @@ def predict( input_dataset (str): The dataset name of the input array. output_path (LocalArrayIdentifier | str): The path where the prediction array will be stored, or a LocalArryIdentifier for the prediction array. output_roi (Optional[Roi | str], optional): The ROI of the output array. If None, the ROI of the input array will be used. Defaults to None. - num_workers (int, optional): The number of workers to use for blockwise prediction. Defaults to 30. + num_workers (int, optional): The number of workers to use for blockwise prediction. Defaults to 1 for local processing, otherwise 12. output_dtype (np.dtype | str, optional): The dtype of the output array. Defaults to np.uint8. overwrite (bool, optional): If True, the output array will be overwritten if it already exists. Defaults to True. """ @@ -46,15 +47,6 @@ def predict( run_config = config_store.retrieve_run_config(run_name) run = Run(run_config) - # check to see if we can load the weights - weights_store = create_weights_store() - try: - weights_store.retrieve_weights(run_name, iteration) - except FileNotFoundError: - raise ValueError( - f"No weights found for run {run_name} at iteration {iteration}." 
- ) - # get arrays input_array_identifier = LocalArrayIdentifier(Path(input_container), input_dataset) raw_array = ZarrArray.open_from_array_identifier(input_array_identifier) @@ -73,6 +65,10 @@ def predict( ) # get the model's input and output size + compute_context = create_compute_context() + if isinstance(compute_context, LocalTorch): + num_workers = 1 + model = run.model.eval() input_voxel_size = Coordinate(raw_array.voxel_size) From 5e6b0f4084fbbe70f3b7d3e8487265debc74e332 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 20 Mar 2024 11:52:06 -0400 Subject: [PATCH 12/20] =?UTF-8?q?perf:=20=E2=9A=A1=EF=B8=8F=20Change=20def?= =?UTF-8?q?ault=20validation=20worker=20number.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dacapo/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dacapo/validate.py b/dacapo/validate.py index 027728fd..91820df6 100644 --- a/dacapo/validate.py +++ b/dacapo/validate.py @@ -17,7 +17,7 @@ def validate( run_name: str, iteration: int, - num_workers: int = 30, + num_workers: int = 4, output_dtype: str = "uint8", overwrite: bool = True, ): From 04c05d1f79c6410bfbb6279e625a283e4db8e5a0 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 20 Mar 2024 12:58:33 -0400 Subject: [PATCH 13/20] =?UTF-8?q?feat:=20=F0=9F=9A=80=20Improve=20model=20?= =?UTF-8?q?loading/prediction.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dacapo/blockwise/predict_worker.py | 23 +++++++++++-------- .../experiments/trainers/gunpowder_trainer.py | 5 +++- dacapo/predict.py | 22 +++++++++++------- dacapo/validate.py | 7 +----- 4 files changed, 32 insertions(+), 25 deletions(-) diff --git a/dacapo/blockwise/predict_worker.py b/dacapo/blockwise/predict_worker.py index d8f4ab0a..6c3822cb 100644 --- a/dacapo/blockwise/predict_worker.py +++ b/dacapo/blockwise/predict_worker.py @@ -1,5 +1,6 @@ import sys from pathlib import Path +from typing import Optional import torch from dacapo.experiments.datasplits.datasets.arrays import ZarrArray @@ -46,8 +47,9 @@ def cli(log_level): "-i", "--iteration", required=True, - type=int, + type=Optional[int], help="The training iteration of the model to use for prediction.", + default=None, ) @click.option( "-ic", @@ -62,7 +64,7 @@ def cli(log_level): @click.option("-od", "--output_dataset", required=True, type=str) def start_worker( run_name: str, - iteration: int, + iteration: int | None, input_container: Path | str, input_dataset: str, output_container: Path | str, @@ -76,11 +78,12 @@ def start_worker( run_config = config_store.retrieve_run_config(run_name) run = Run(run_config) - # create weights store - weights_store = create_weights_store() + if iteration is not None: + # create weights store + weights_store = create_weights_store() - # load weights - weights_store.retrieve_weights(run_name, iteration) + # load weights + weights_store.retrieve_weights(run_name, iteration) # get arrays input_array_identifier = LocalArrayIdentifier(Path(input_container), input_dataset) @@ -178,7 +181,7 @@ def start_worker( def spawn_worker( run_name: str, - iteration: int, + iteration: int | None, input_array_identifier: "LocalArrayIdentifier", output_array_identifier: "LocalArrayIdentifier", ): @@ -186,7 +189,7 @@ def spawn_worker( Args: run_name (str): The name of the run to apply. - iteration (int): The training iteration of the model to use for prediction. + iteration (int or None): The training iteration of the model to use for prediction. 
input_array_identifier (LocalArrayIdentifier): The raw data to predict on. output_array_identifier (LocalArrayIdentifier): The identifier of the prediction array. """ @@ -200,8 +203,6 @@ def spawn_worker( "start-worker", "--run-name", run_name, - "--iteration", - iteration, "--input_container", input_array_identifier.container, "--input_dataset", @@ -211,6 +212,8 @@ def spawn_worker( "--output_dataset", output_array_identifier.dataset, ] + if iteration is not None: + command.extend(["--iteration", str(iteration)]) print("Defining worker with command: ", compute_context.wrap_command(command)) diff --git a/dacapo/experiments/trainers/gunpowder_trainer.py b/dacapo/experiments/trainers/gunpowder_trainer.py index 19acb2d4..74fb9b80 100644 --- a/dacapo/experiments/trainers/gunpowder_trainer.py +++ b/dacapo/experiments/trainers/gunpowder_trainer.py @@ -324,7 +324,10 @@ def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): - self._iter.send(True) + try: + self._iter.send(True) + except TypeError: + self._iter.send(None) pass def can_train(self, datasets) -> bool: diff --git a/dacapo/predict.py b/dacapo/predict.py index 622811a6..d3d53f5a 100644 --- a/dacapo/predict.py +++ b/dacapo/predict.py @@ -19,8 +19,8 @@ def predict( - run_name: str, - iteration: int, + run_name: str | Run, + iteration: int | None, input_container: Path | str, input_dataset: str, output_path: LocalArrayIdentifier | Path | str, @@ -32,8 +32,8 @@ def predict( """Predict with a trained model. Args: - run_name (str): The name of the run to predict with. - iteration (int): The training iteration of the model to use for prediction. + run_name (str or Run): The name of the run to predict with or the Run object. + iteration (int or None): The training iteration of the model to use for prediction. input_container (Path | str): The container of the input array. input_dataset (str): The dataset name of the input array. output_path (LocalArrayIdentifier | str): The path where the prediction array will be stored, or a LocalArryIdentifier for the prediction array. @@ -43,9 +43,13 @@ def predict( overwrite (bool, optional): If True, the output array will be overwritten if it already exists. Defaults to True. 
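+
+    Example (illustrative only; the run name, iteration, and paths below are placeholders):
+
+        predict(
+            "cosem_distance_run",
+            1500,
+            input_container="/path/to/test.zarr",
+            input_dataset="raw",
+            output_path="/path/to/predictions.zarr",
+        )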
""" # retrieving run - config_store = create_config_store() - run_config = config_store.retrieve_run_config(run_name) - run = Run(run_config) + if isinstance(run_name, Run): + run = run_name + run_name = run.name + else: + config_store = create_config_store() + run_config = config_store.retrieve_run_config(run_name) + run = Run(run_config) # get arrays input_array_identifier = LocalArrayIdentifier(Path(input_container), input_dataset) @@ -76,6 +80,8 @@ def predict( input_shape = Coordinate(model.eval_input_shape) input_size = input_voxel_size * input_shape output_size = output_voxel_size * model.compute_output_shape(input_shape)[1] + num_out_channels = model.num_out_channels + del model # calculate input and output rois @@ -111,7 +117,7 @@ def predict( output_array_identifier, raw_array.axes, output_roi, - model.num_out_channels, + num_out_channels, output_voxel_size, output_dtype, overwrite=overwrite, diff --git a/dacapo/validate.py b/dacapo/validate.py index 91820df6..73864372 100644 --- a/dacapo/validate.py +++ b/dacapo/validate.py @@ -40,10 +40,6 @@ def validate( run_name ) - # create weights store and read weights - weights_store = create_weights_store() - weights_store.retrieve_weights(run.name, iteration) - return validate_run( run, iteration, @@ -75,7 +71,6 @@ def validate_run( return None, None # get array and weight store - weights_store = create_weights_store() array_store = create_array_store() iteration_scores = [] @@ -158,7 +153,7 @@ def validate_run( run.name, iteration, validation_dataset.name ) predict( - run.name, + run, iteration, input_container=input_raw_array_identifier.container, input_dataset=input_raw_array_identifier.dataset, From 88d088abaa09c49e918d9f6d5fc8748c65ffb32e Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 20 Mar 2024 14:05:17 -0400 Subject: [PATCH 14/20] =?UTF-8?q?chore:=20=F0=9F=99=88=20Remove=20ipynoteb?= =?UTF-8?q?ook=20checkpoints.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + .../random_source_pipeline-checkpoint.py | 288 --------- .../synthetic_source_worker-checkpoint.py | 216 ------- .../.ipynb_checkpoints/utils-checkpoint.py | 242 -------- .../cosem_example-checkpoint.ipynb | 362 ------------ ...example_fill_in_the_blank-checkpoint.ipynb | 236 -------- .../cosem_finetune_example-checkpoint.ipynb | 271 --------- .../synthetic_example-checkpoint.ipynb | 548 ------------------ .../synthetic_example-checkpoint.py | 385 ------------ 9 files changed, 1 insertion(+), 2548 deletions(-) delete mode 100644 dacapo/examples/.ipynb_checkpoints/random_source_pipeline-checkpoint.py delete mode 100644 dacapo/examples/.ipynb_checkpoints/synthetic_source_worker-checkpoint.py delete mode 100644 dacapo/examples/.ipynb_checkpoints/utils-checkpoint.py delete mode 100644 dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example-checkpoint.ipynb delete mode 100644 dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example_fill_in_the_blank-checkpoint.ipynb delete mode 100644 dacapo/examples/distance_task/.ipynb_checkpoints/cosem_finetune_example-checkpoint.ipynb delete mode 100644 dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.ipynb delete mode 100644 dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.py diff --git a/.gitignore b/.gitignore index c911b587..d14f3fb8 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.hdf *.h5 # *.ipynb +.ipynb_checkpoints/ *.pyc *.egg-info *.dat diff --git 
a/dacapo/examples/.ipynb_checkpoints/random_source_pipeline-checkpoint.py b/dacapo/examples/.ipynb_checkpoints/random_source_pipeline-checkpoint.py deleted file mode 100644 index 8dbc950b..00000000 --- a/dacapo/examples/.ipynb_checkpoints/random_source_pipeline-checkpoint.py +++ /dev/null @@ -1,288 +0,0 @@ -from typing import Iterable -import gunpowder as gp -import logging -import numpy as np -import random -from scipy.ndimage import ( - distance_transform_edt, - gaussian_filter, -) -from skimage.measure import label as relabel - -logging.basicConfig(level=logging.INFO) - - -class CreatePoints(gp.BatchFilter): - def __init__( - self, - labels, - num_points=(20, 150), - ): - self.labels = labels - self.num_points = num_points - - def process(self, batch, request): - labels = batch[self.labels].data - - num_points = random.randint(*self.num_points) - - z = np.random.randint(1, labels.shape[0] - 1, num_points) - y = np.random.randint(1, labels.shape[1] - 1, num_points) - x = np.random.randint(1, labels.shape[2] - 1, num_points) - - labels[z, y, x] = 1 - - batch[self.labels].data = labels - - -class MakeRaw(gp.BatchFilter): - def __init__( - self, - raw, - labels, - gaussian_noise_args: Iterable = (0.5, 0.1), - gaussian_noise_lim: float = 0.3, - gaussian_blur_args: Iterable = (0.5, 1.5), - membrane_like=True, - membrane_size=3, - inside_value=0.5, - ): - self.raw = raw - self.labels = labels - self.gaussian_noise_args = gaussian_noise_args - self.gaussian_noise_lim = gaussian_noise_lim - self.gaussian_blur_args = gaussian_blur_args - self.membrane_like = membrane_like - self.membrane_size = membrane_size - self.inside_value = inside_value - - def setup(self): - spec = self.spec[self.labels].copy() # type: ignore - spec.dtype = np.float32 - self.provides(self.raw, spec) - - def process(self, batch, request): - labels = batch[self.labels].data - raw: np.ndarray = np.zeros_like(labels, dtype=np.float32) - raw[labels > 0] = 1 - - # generate membrane-like structure - if self.membrane_like: - for id in np.unique(labels): - if id == 0: - continue - raw[distance_transform_edt(labels == id) > self.membrane_size] = self.inside_value # type: ignore - - # now add blur - raw = gaussian_filter(raw, random.uniform(*self.gaussian_blur_args)) - - # now add noise - noise = np.random.normal(*self.gaussian_noise_args, raw.shape) # type: ignore - # normalize to [0, gaussian_noise_lim] - noise -= noise.min() - noise /= noise.max() - noise *= self.gaussian_noise_lim - - raw += noise - raw /= 1 + self.gaussian_noise_lim - raw = 1 - raw # invert - raw.clip(0, 1, out=raw) - - # add to batch - spec = self._spec[self.raw].copy() # type: ignore - spec.roi = request[self.raw].roi - batch[self.raw] = gp.Array(raw, spec) - - -class DilatePoints(gp.BatchFilter): - def __init__(self, labels, dilations=[2, 8]): - self.labels = labels - self.dilations = dilations - - def process(self, batch, request): - labels = batch[self.labels].data - - dilations = random.randint(*self.dilations) - labels = (distance_transform_edt(labels == 0) <= dilations).astype(labels.dtype) # type: ignore - - batch[self.labels].data = labels - - -class RandomDilateLabels(gp.BatchFilter): - def __init__(self, labels, dilations=[2, 8]): - self.labels = labels - self.dilations = dilations - - def process(self, batch, request): - labels = batch[self.labels].data - - new_labels = np.zeros_like(labels) - for id in np.unique(labels): - if id == 0: - continue - dilations = np.random.randint(*self.dilations) - - # # make sure we don't overlap existing labels - 
new_labels[ - np.logical_or( - labels == id, - np.logical_and( - distance_transform_edt(labels != id) <= dilations, labels == 0 - ), - ) - ] = id # type: ignore - - batch[self.labels].data = new_labels - - -class Relabel(gp.BatchFilter): - def __init__(self, labels, connectivity=1): - self.labels = labels - self.connectivity = connectivity - - def process(self, batch, request): - labels = batch[self.labels].data - - relabeled = relabel(labels, connectivity=self.connectivity).astype(labels.dtype) # type: ignore - - batch[self.labels].data = relabeled - - -class ExpandLabels(gp.BatchFilter): - def __init__(self, labels, background=0): - self.labels = labels - self.background = background - - def process(self, batch, request): - labels_data = batch[self.labels].data - distance = labels_data.shape[0] - - distances, indices = distance_transform_edt( - labels_data == self.background, return_indices=True - ) # type: ignore - - expanded_labels = np.zeros_like(labels_data) - - dilate_mask = distances <= distance - - masked_indices = [ - dimension_indices[dilate_mask] for dimension_indices in indices - ] - - nearest_labels = labels_data[tuple(masked_indices)] - - expanded_labels[dilate_mask] = nearest_labels - - batch[self.labels].data = expanded_labels - - -class ZerosSource(gp.BatchProvider): - def __init__(self, key, spec): - self.key = key - self._spec = {key: spec} - - def setup(self): - pass - - def provide(self, request): - batch = gp.Batch() - - roi = request[self.key].roi - shape = (roi / self._spec[self.key].voxel_size).get_shape() - spec = self._spec[self.key].copy() - spec.roi = roi - - batch.arrays[self.key] = gp.Array(np.zeros(shape, dtype=spec.dtype), spec) - - return batch - - -def random_source_pipeline( - voxel_size=(8, 8, 8), - input_shape=(148, 148, 148), - dtype=np.uint8, - expand_labels=False, - relabel_connectivity=1, - random_dilate=True, - num_points=(20, 150), - gaussian_noise_args=(0, 0.1), - gaussian_blur_args=(0.5, 1.5), - membrane_like=True, - membrane_size=3, - inside_value=0.5, -): - """Create a random source pipeline and batch request for example training. - - Args: - - voxel_size (tuple of int): The size of a voxel in world units. - input_shape (tuple of int): The shape of the input arrays. - dtype (numpy.dtype): The dtype of the label arrays. - expand_labels (bool): Whether to expand the labels into the background. - relabel_connectivity (int): The connectivity used for for relabeling. - random_dilate (bool): Whether to randomly dilate the individual labels. - num_points (tuple of int): The range of the number of points to add to the labels. - gaussian_noise_args (tuple of float): The mean and standard deviation of the Gaussian noise to add to the raw array. - gaussian_blur_args (tuple of float): The mean and standard deviation of the Gaussian blur to apply to the raw array. - membrane_like (bool): Whether to generate a membrane-like structure in the raw array. - membrane_size (int): The width of the membrane-like structure on the outside of the objects. - inside_value (float): The value to set inside the membranes of objects. - - Returns: - - gunpowder.Pipeline: The batch generating Gunpowder pipeline. - gunpowder.BatchRequest: The batch request for the pipeline. 
- """ - - voxel_size = gp.Coordinate(voxel_size) - input_shape = gp.Coordinate(input_shape) - - labels = gp.ArrayKey("LABELS") - raw = gp.ArrayKey("RAW") - - input_size = input_shape * voxel_size - - request = gp.BatchRequest() - - request.add(labels, input_size) - request.add(raw, input_size) - - source_spec = gp.ArraySpec( - roi=gp.Roi((0, 0, 0), input_size), voxel_size=voxel_size, dtype=dtype - ) - source = ZerosSource(labels, source_spec) - - pipeline = source - - # randomly sample some points and write them into our zeros array as ones - pipeline += CreatePoints(labels, num_points=num_points) - - # grow the boundaries - pipeline += DilatePoints(labels) - - # relabel connected components - pipeline += Relabel(labels, connectivity=relabel_connectivity) - - if expand_labels: - # expand the labels outwards into the background - pipeline += ExpandLabels(labels) - - # relabel ccs again to deal with incorrectly connected background - pipeline += Relabel(labels, connectivity=relabel_connectivity) - - # randomly dilate labels - if random_dilate: - pipeline += RandomDilateLabels(labels) - - # make a raw array - pipeline += MakeRaw( - raw, - labels, - gaussian_noise_args=gaussian_noise_args, - gaussian_blur_args=gaussian_blur_args, - membrane_like=membrane_like, - membrane_size=membrane_size, - inside_value=inside_value, - ) - - return pipeline, request diff --git a/dacapo/examples/.ipynb_checkpoints/synthetic_source_worker-checkpoint.py b/dacapo/examples/.ipynb_checkpoints/synthetic_source_worker-checkpoint.py deleted file mode 100644 index cdf21624..00000000 --- a/dacapo/examples/.ipynb_checkpoints/synthetic_source_worker-checkpoint.py +++ /dev/null @@ -1,216 +0,0 @@ -from dacapo.examples.random_source_pipeline import random_source_pipeline -import gunpowder as gp - - -from pathlib import Path -import sys -from dacapo.experiments.datasplits.datasets.arrays.zarr_array import ZarrArray -from dacapo.store.array_store import LocalArrayIdentifier -from dacapo.compute_context import create_compute_context -import dacapo - -import daisy -from funlib.geometry import Coordinate, Roi - -import numpy as np -import click - -import logging - -logger = logging.getLogger(__file__) - -read_write_conflict: bool = False -fit: str = "shrink" -path = __file__ - - -@click.group() -@click.option( - "--log-level", - type=click.Choice( - ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False - ), - default="INFO", -) -def cli(log_level): - logging.basicConfig(level=getattr(logging, log_level.upper())) - - -fit = "valid" - -def generate_synthetic_dataset( - output_container: Path | str, - raw_output_dataset: str = "raw", - labels_output_dataset: str = "labels", - shape: str | Coordinate = Coordinate((512, 512, 512)), - voxel_size: str | Coordinate = Coordinate((8, 8, 8)), - write_shape: str | Coordinate = Coordinate((256, 256, 256)), - num_workers: int = 16, - overwrite: bool = False, -): - # get ROI from string - if isinstance(voxel_size, str): - _voxel_size = Coordinate([int(v) for v in voxel_size.split(",")]) - else: - _voxel_size = voxel_size - if isinstance(shape, str): - _shape = Coordinate([int(v) for v in shape.split(",")]) - else: - _shape = shape - if isinstance(write_shape, str): - _write_shape = Coordinate([int(v) for v in write_shape.split(",")]) - else: - _write_shape = write_shape - roi = Roi((0, 0, 0), _shape * _voxel_size) - read_roi = write_roi = Roi((0, 0, 0), _write_shape * _voxel_size) - - # get arrays - raw_output_array_identifier = LocalArrayIdentifier( - Path(output_container), 
raw_output_dataset - ) - raw_output_array = ZarrArray.create_from_array_identifier( - raw_output_array_identifier, - roi=roi, - dtype=np.uint8, - voxel_size=_voxel_size, - num_channels=None, - axes=["z", "y", "x"], - overwrite=overwrite, - write_size=_write_shape * voxel_size, - ) - - labels_output_array_identifier = LocalArrayIdentifier( - Path(output_container), labels_output_dataset - ) - labels_output_array = ZarrArray.create_from_array_identifier( - labels_output_array_identifier, - roi=roi, - dtype=np.uint64, - voxel_size=_voxel_size, - num_channels=None, - axes=["z", "y", "x"], - overwrite=overwrite, - write_size=_write_shape * voxel_size, - ) - - # make daisy blockwise task - dacapo.run_blockwise( - __file__, - roi, - read_roi, - write_roi, - num_workers=num_workers, - raw_output_array_identifier=raw_output_array_identifier, - labels_output_array_identifier=labels_output_array_identifier, - ) - - -@cli.command() -@click.option( - "-oc", "--output_container", required=True, type=click.Path(file_okay=False) -) -@click.option("-rod", "--raw_output_dataset", required=True, type=str) -@click.option("-lod", "--labels_output_dataset", required=True, type=str) -def start_worker( - output_container: Path | str, - raw_output_dataset: str, - labels_output_dataset: str, -): - # get arrays - raw_output_array_identifier = LocalArrayIdentifier( - Path(output_container), raw_output_dataset - ) - raw_output_array = ZarrArray.open_from_array_identifier(raw_output_array_identifier) - - labels_output_array_identifier = LocalArrayIdentifier( - Path(output_container), labels_output_dataset - ) - labels_output_array = ZarrArray.open_from_array_identifier( - labels_output_array_identifier - ) - - # get data generator - - def batch_generator(shape=(128, 128, 128), voxel_size=(8, 8, 8)): - pipeline, request = random_source_pipeline( - input_shape=shape, voxel_size=voxel_size - ) - with gp.build(pipeline): - while True: - yield pipeline.request_batch(request) - - batch_gen = None - - id_offset = None - - # wait for blocks to run pipeline - client = daisy.Client() - - while True: - print("getting block") - with client.acquire_block() as block: - if block is None: - break - - if batch_gen is None or id_offset is None: - size = block.write_roi.get_shape() - voxel_size = raw_output_array.voxel_size - shape = Coordinate(size / voxel_size) - batch_gen = batch_generator( - shape=shape, - voxel_size=voxel_size, - ) - id_offset = np.prod(shape) # number of voxels in the block - batch = next(batch_gen) - raw_array = batch.arrays[gp.ArrayKey("RAW")] - labels_array = batch.arrays[gp.ArrayKey("LABELS")] - - raw_data = raw_array.data - raw_data -= raw_data.min() - raw_data /= raw_data.max() - raw_data *= 255 - raw_data = raw_data.astype(np.uint8) - labels_data = labels_array.data.astype(np.uint64) - labels_data += np.uint64(id_offset * block.block_id[1]) - labels_data[labels_data == np.uint64(id_offset * block.block_id[1])] = 0 - - # write to output array - raw_output_array[block.write_roi] = raw_data - labels_output_array[block.write_roi] = labels_data - - -def spawn_worker( - raw_output_array_identifier: "LocalArrayIdentifier", - labels_output_array_identifier: "LocalArrayIdentifier", -): - """Spawn a worker to generate a synthetic dataset. - - Args: - raw_output_array_identifier (LocalArrayIdentifier): The identifier of the raw output array. - labels_output_array_identifier (LocalArrayIdentifier): The identifier of the labels output array. 
- """ - compute_context = create_compute_context() - - # Make the command for the worker to run - command = [ - # "python", - sys.executable, - path, - "start-worker", - "--output_container", - raw_output_array_identifier.container, - "--raw_output_dataset", - raw_output_array_identifier.dataset, - "--labels_output_dataset", - labels_output_array_identifier.dataset, - ] - - def run_worker(): - # Run the worker in the given compute context - compute_context.execute(command) - - return run_worker - - -if __name__ == "__main__": - cli() diff --git a/dacapo/examples/.ipynb_checkpoints/utils-checkpoint.py b/dacapo/examples/.ipynb_checkpoints/utils-checkpoint.py deleted file mode 100644 index 184e594f..00000000 --- a/dacapo/examples/.ipynb_checkpoints/utils-checkpoint.py +++ /dev/null @@ -1,242 +0,0 @@ -from typing import Optional -import neuroglancer -from IPython.display import IFrame -import numpy as np -import gunpowder as gp -from funlib.persistence import Array -from dacapo.experiments.datasplits.datasets.arrays import ZarrArray -from funlib.persistence import open_ds -from threading import Thread -import neuroglancer -from neuroglancer.viewer_state import ViewerState -import os -from dacapo.experiments.run import Run -from dacapo.store.create_store import create_array_store -from IPython.display import IFrame -import time -import copy -import json - -def get_viewer( - raw_array: gp.Array | Array | ZarrArray, - labels_array: gp.Array | Array | ZarrArray, - pred_array: Optional[gp.Array | Array | ZarrArray] = None, - pred_labels_array: Optional[gp.Array | Array | ZarrArray] = None, - width: int = 1500, - height: int = 600, -) -> IFrame: - arrays = { - "raw": raw_array, - "labels": labels_array, - } - if pred_array is not None: - arrays["pred"] = pred_array - if pred_labels_array is not None: - arrays["pred_labels"] = pred_labels_array - - data = {} - voxel_sizes = {} - for name, array in arrays.items(): - if hasattr(array, "to_ndarray"): - data[name] = array.to_ndarray() - else: - data[name] = array.data - if hasattr(array, "voxel_size"): - voxel_sizes[name] = array.voxel_size - else: - voxel_sizes[name] = array.spec.voxel_size - - neuroglancer.set_server_bind_address("0.0.0.0") - viewer = neuroglancer.Viewer() - with viewer.txn() as state: - state.showSlices = False - add_seg_layer(state, "labels", data["labels"], voxel_sizes["labels"]) - - add_scalar_layer(state, "raw", data["raw"], voxel_sizes["raw"]) - - if "pred" in data: - add_scalar_layer(state, "pred", data["pred"], voxel_sizes["pred"]) - - if "pred_labels" in data: - add_seg_layer( - state, "pred_labels", data["pred_labels"], voxel_sizes["pred_labels"] - ) - - return IFrame(src=viewer, width=width, height=height) - - -class NeuroglancerRunViewer: - def __init__(self, run: Run): - self.run: Run = run - self.most_recent_iteration = 0 - self.prediction = None - - def updated_neuroglancer_layer(self, layer_name, ds): - source = neuroglancer.LocalVolume( - data=ds.data, - dimensions=neuroglancer.CoordinateSpace( - names=["c", "z", "y", "x"], - units=["", "nm", "nm", "nm"], - scales=[1] + list(ds.voxel_size), - ), - voxel_offset=[0] + list(ds.roi.offset), - ) - new_state = copy.deepcopy(self.viewer.state) - if len(new_state.layers) == 1: - new_state.layers[layer_name] = neuroglancer.ImageLayer(source=source) - else: - # replace name everywhere to preserve state, like what is selected - new_state_str = json.dumps(new_state.to_json()) - new_state_str = new_state_str.replace(new_state.layers[-1].name, layer_name) - new_state = 
ViewerState(json.loads(new_state_str)) - new_state.layers[layer_name].source = source - - self.viewer.set_state(new_state) - print(self.viewer.state) - - def deprecated_start_neuroglancer(self): - neuroglancer.set_server_bind_address("0.0.0.0") - self.viewer = neuroglancer.Viewer() - - def start_neuroglancer(self): - neuroglancer.set_server_bind_address("0.0.0.0") - self.viewer = neuroglancer.Viewer() - with self.viewer.txn() as state: - state.showSlices = False - - state.layers["raw"] = neuroglancer.ImageLayer( - source=neuroglancer.LocalVolume( - data=self.raw.data, - dimensions=neuroglancer.CoordinateSpace( - names=["z", "y", "x"], - units=["nm", "nm", "nm"], - scales=self.raw.voxel_size, - ), - voxel_offset=self.raw.roi.offset, - ), - ) - return IFrame(src=self.viewer, width=1800, height=900) - - def start(self): - self.array_store = create_array_store() - self.get_datasets() - self.new_validation_checker() - return self.start_neuroglancer() - - def open_from_array_identitifier(self, array_identifier): - if os.path.exists(array_identifier.container / array_identifier.dataset): - return open_ds(str(array_identifier.container), array_identifier.dataset) - else: - return None - - def get_datasets(self): - for validation_dataset in self.run.datasplit.validate: - ( - input_raw_array_identifier, - input_gt_array_identifier, - ) = self.array_store.validation_input_arrays( - self.run.name, validation_dataset.name - ) - - self.raw = self.open_from_array_identitifier(input_raw_array_identifier) - self.gt = self.open_from_array_identitifier(input_gt_array_identifier) - print(self.raw) - - def update_best_info(self, iteration, validation_dataset_name): - prediction_array_identifier = self.array_store.validation_prediction_array( - self.run.name, - iteration, - validation_dataset_name, - ) - self.prediction = self.open_from_array_identitifier(prediction_array_identifier) - self.most_recent_iteration = iteration - - def update_neuroglancer(self, iteration): - self.updated_neuroglancer_layer( - f"prediction at iteration {iteration}", self.prediction - ) - return None - - def update_best(self, iteration, validation_dataset_name): - self.update_best_info(iteration, validation_dataset_name) - self.update_neuroglancer(iteration) - - def new_validation_checker(self): - self.process = Thread(target=self.update_with_new_validation_if_possible) - self.process.daemon = True - self.process.start() - - def update_with_new_validation_if_possible(self): - # Here we are assuming that we are checking the directory .../valdiation_config/prediction - # Ideally we will only have to check for the current best validation - while True: - time.sleep(3) - for validation_dataset in self.run.datasplit.validate: - most_recent_iteration_previous = self.most_recent_iteration - prediction_array_identifier = ( - self.array_store.validation_prediction_array( - self.run.name, - self.most_recent_iteration, - validation_dataset.name, - ) - ) - - container = prediction_array_identifier.container - if os.path.exists(container): - iteration_dirs = [ - name - for name in os.listdir(container) - if os.path.isdir(os.path.join(container, name)) - and name.isnumeric() - ] - - for iteration_dir in iteration_dirs: - if int(iteration_dir) > self.most_recent_iteration: - inference_dir = os.path.join( - container, - iteration_dir, - "validation_config", - "prediction", - ) - if os.path.exists(inference_dir): - # Ignore basic zarr and n5 files - inference_dir_contents = [ - f - for f in os.listdir(inference_dir) - if not f.startswith(".") and not 
f.endswith(".json") - ] - if inference_dir_contents: - # then it should have at least a chunk writtent out, assume it has all of it written out - self.most_recent_iteration = int(iteration_dir) - if most_recent_iteration_previous != self.most_recent_iteration: - self.update_best( - self.most_recent_iteration, - validation_dataset.name, - ) - -def add_seg_layer(state, name, data, voxel_size): - state.layers[name] = neuroglancer.SegmentationLayer( - # segments=[str(i) for i in np.unique(data[data > 0])], # this line will cause all objects to be selected and thus all meshes to be generated...will be slow if lots of high res meshes - source=neuroglancer.LocalVolume( - data=data, - dimensions=neuroglancer.CoordinateSpace( - names=["z", "y", "x"], - units=["nm", "nm", "nm"], - scales=voxel_size, - ), - ), - segments=np.unique(data[data > 0]), - ) - - -def add_scalar_layer(state, name, data, voxel_size): - state.layers[name] = neuroglancer.ImageLayer( - source=neuroglancer.LocalVolume( - data=data, - dimensions=neuroglancer.CoordinateSpace( - names=["z", "y", "x"], - units=["nm", "nm", "nm"], - scales=voxel_size, - ), - ), - ) diff --git a/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example-checkpoint.ipynb b/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example-checkpoint.ipynb deleted file mode 100644 index 3c974cf1..00000000 --- a/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example-checkpoint.ipynb +++ /dev/null @@ -1,362 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " # Dacapo\n", - "\n", - " DaCapo is a framework that allows for easy configuration and execution of established machine learning techniques on arbitrarily large volumes of multi-dimensional images.\n", - "\n", - " DaCapo has 4 major configurable components:\n", - " 1. **dacapo.datasplits.DataSplit**\n", - "\n", - " 2. **dacapo.tasks.Task**\n", - "\n", - " 3. **dacapo.architectures.Architecture**\n", - "\n", - " 4. **dacapo.trainers.Trainer**\n", - "\n", - " These are then combined in a single **dacapo.experiments.Run** that includes your starting point (whether you want to start training from scratch or continue off of a previously trained model) and stopping criterion (the number of iterations you want to train)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Environment setup\n", - " If you have not already done so, you will need to install DaCapo. You can do this by first creating a new environment and then installing DaCapo using pip.\n", - "\n", - " ```bash\n", - " conda create -n dacapo python=3.10\n", - " conda activate dacapo\n", - " ```\n", - "\n", - " Then, you can install DaCapo using pip, via GitHub:\n", - "\n", - " ```bash\n", - " pip install git+https://github.com/janelia-cellmap/dacapo.git\n", - " ```\n", - "\n", - " Or you can clone the repository and install it locally:\n", - "\n", - " ```bash\n", - " git clone https://github.com/janelia-cellmap/dacapo.git\n", - " cd dacapo\n", - " pip install -e .\n", - " ```\n", - "\n", - " Be sure to select this environment in your Jupyter notebook or JupyterLab." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Config Store\n", - " To define where the data goes, create a dacapo.yaml configuration file either in `~/.config/dacapo/dacapo.yaml` or in `./dacapo.yaml`. 
Here is a template:\n", - "\n", - " ```yaml\n", - " type: files\n", - " runs_base_dir: /path/to/my/data/storage\n", - " ```\n", - " The `runs_base_dir` defines where your on-disk data will be stored. The `type` setting determines the database backend. The default is `files`, which stores the data in a file tree on disk. Alternatively, you can use `mongodb` to store the data in a MongoDB database. To use MongoDB, you will need to provide a `mongodbhost` and `mongodbname` in the configuration file:\n", - "\n", - " ```yaml\n", - " ...\n", - " mongodbhost: mongodb://dbuser:dbpass@dburl:dbport/\n", - " mongodbname: dacapo" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating FileConfigStore:\n", - "\tpath: /nrs/cellmap/ackermand/dacapo_learnathon/configs\n" - ] - } - ], - "source": [ - "# First we need to create a config store to store our configurations\n", - "from dacapo.store.create_store import create_config_store\n", - "\n", - "config_store = create_config_store()\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Datasplit\n", - " Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?\n", - " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", - " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:dacapo.experiments.datasplits.datasplit_generator: No targets specified, using all classes in the dataset as target ['mito'].\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Neuroglancer link: http://h10u28.int.janelia.org:19399/v/a9fea3fb1009ac31987b6fb7d5ecb032fcff77db/\n" - ] - } - ], - "source": [ - "from dacapo.experiments.datasplits import DataSplitGenerator\n", - "from funlib.geometry import Coordinate\n", - "\n", - "input_resolution = Coordinate(8, 8, 8)\n", - "output_resolution = Coordinate(4, 4, 4)\n", - "datasplit_config = DataSplitGenerator.generate_from_csv(\n", - " \"/misc/public/dacapo_learnathon/datasplit_csvs/cosem_example.csv\", input_resolution, output_resolution\n", - ").compute()\n", - "\n", - "datasplit = datasplit_config.datasplit_type(datasplit_config)\n", - "viewer = datasplit._neuroglancer()\n", - "config_store.store_datasplit_config(datasplit_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Task\n", - " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", - " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", - " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", - " also require specific non-linearities or output formats from your model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.tasks import DistanceTaskConfig\n", - "\n", - "task_config = DistanceTaskConfig(\n", - " name=\"cosem_distance_task_4nm\",\n", - " channels=[\"mito\"],\n", - " clip_distance=40.0,\n", - " tol_distance=40.0,\n", - " scale_factor=80.0,\n", - ")\n", - "config_store.store_task_config(task_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Architecture\n", - "\n", - " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", - "\n", - "architecture_config = CNNectomeUNetConfig(\n", - " name=\"upsample_unet\",\n", - " input_shape=Coordinate(216, 216, 216),\n", - " eval_shape_increase=Coordinate(72, 72, 72),\n", - " fmaps_in=1,\n", - " num_fmaps=12,\n", - " fmaps_out=72,\n", - " fmap_inc_factor=6,\n", - " downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],\n", - " constant_upsample=True,\n", - " upsample_factors=[(2, 2, 2)],\n", - ")\n", - "config_store.store_architecture_config(architecture_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Trainer\n", - "\n", - " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", - "from dacapo.experiments.trainers.gp_augments import (\n", - " ElasticAugmentConfig,\n", - " GammaAugmentConfig,\n", - " IntensityAugmentConfig,\n", - " IntensityScaleShiftAugmentConfig,\n", - ")\n", - "\n", - "trainer_config = GunpowderTrainerConfig(\n", - " name=\"cosem\",\n", - " batch_size=1,\n", - " learning_rate=0.0001,\n", - " num_data_fetchers=20,\n", - " augments=[\n", - " ElasticAugmentConfig(\n", - " control_point_spacing=[100, 100, 100],\n", - " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", - " rotation_interval=(0.0, 1.5707963267948966),\n", - " subsample=8,\n", - " uniform_3d_rotation=True,\n", - " ),\n", - " IntensityAugmentConfig(scale=(0.25, 1.75), shift=(-0.5, 0.35), clip=True),\n", - " GammaAugmentConfig(gamma_range=(0.5, 2.0)),\n", - " IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0),\n", - " ],\n", - " snapshot_interval=10000,\n", - " min_masked=0.05,\n", - " clip_raw=True,\n", - ")\n", - "config_store.store_trainer_config(trainer_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Run\n", - " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments import RunConfig\n", - "from dacapo.experiments.run import Run\n", - "\n", - "start_config = None\n", - "\n", - "# Uncomment to start from a pretrained model\n", - "# start_config = StartConfig(\n", - "# \"setup04\",\n", - "# \"best\",\n", - "# )\n", - "\n", - "iterations = 2000\n", - "validation_interval = 50\n", - "repetitions = 1\n", - "for i in range(repetitions):\n", - " run_config = RunConfig(\n", - " name=\"cosem_distance_run_4nm\",\n", - " # # NOTE: This is a template for the name of the run. You can customize it as you see fit.\n", - " # name=(\"_\").join(\n", - " # [\n", - " # \"example\",\n", - " # \"scratch\" if start_config is None else \"finetuned\",\n", - " # datasplit_config.name,\n", - " # task_config.name,\n", - " # architecture_config.name,\n", - " # trainer_config.name,\n", - " # ]\n", - " # )\n", - " # + f\"__{i}\",\n", - " datasplit_config=datasplit_config,\n", - " task_config=task_config,\n", - " architecture_config=architecture_config,\n", - " trainer_config=trainer_config,\n", - " num_iterations=iterations,\n", - " validation_interval=validation_interval,\n", - " repetition=i,\n", - " start_config=start_config,\n", - " )\n", - "\n", - " print(run_config.name)\n", - " config_store.store_run_config(run_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Train\n", - " To train one of the runs, you can either do it by first creating a **Run** directly from the run config\n", - " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. This may cause errors" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.train import train_run\n", - "from dacapo.experiments.run import Run\n", - "from dacapo.store.create_store import create_config_store\n", - "\n", - "config_store = create_config_store()\n", - "\n", - "run = Run(config_store.retrieve_run_config(\"cosem_distance_run_4nm\"))\n", - "train_run(run)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. 
This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements.\n", - " # %%\n", - " from dacapo.validate import validate\n", - " # validate(run_config.name, iterations, num_workers=32)\n", - " validate(\"cosem_distance_run\", 1500, num_workers=10)\n", - " # %%" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "DaCapo Learnathon", - "language": "python", - "name": "dacapo_learnathon" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example_fill_in_the_blank-checkpoint.ipynb b/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example_fill_in_the_blank-checkpoint.ipynb deleted file mode 100644 index 83134d93..00000000 --- a/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_example_fill_in_the_blank-checkpoint.ipynb +++ /dev/null @@ -1,236 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# First we need to create a config store to store our configurations\n", - "from dacapo.store.create_store import create_config_store\n", - "\n", - "# create the config store\n", - "config_store = ..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Datasplit\n", - " Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?\n", - " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", - " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.datasplits import DataSplitGenerator\n", - "from funlib.geometry import Coordinate\n", - "\n", - "# We will be working with cosem data and we want to work with 8nm isotropic input resolution for the raw data and output at 4 nm resolution.\n", - "# Create these resolutions as Coordinates.\n", - "input_resolution = ...\n", - "output_resolution = ...\n", - "\n", - "# Create the datasplit config using the cosem_example.csv located in the shared learnathon examples\n", - "datasplit_config = ...\n", - "\n", - "# Create the datasplit, produce the neuroglancer link and store the datasplit\n", - "datasplit = ...\n", - "viewer = ...\n", - "config_store...\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Task\n", - " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", - " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", - " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", - " also require specific non-linearities or output formats from your model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.tasks import DistanceTaskConfig\n", - "\n", - "# Create a distance task config where the clip_distance=tol_distance=10x the output resolution,\n", - "# and scale_factor = 20x the output resolution\n", - "task_config = \n", - "config_store....\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Architecture\n", - "\n", - " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", - "\n", - "architecture_config = CNNectomeUNetConfig(\n", - " name=\"upsample_unet\",\n", - " input_shape=Coordinate(216, 216, 216),\n", - " eval_shape_increase=Coordinate(72, 72, 72),\n", - " fmaps_in=1,\n", - " num_fmaps=12,\n", - " fmaps_out=72,\n", - " fmap_inc_factor=6,\n", - " downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],\n", - " constant_upsample=True,\n", - " upsample_factors=[(2, 2, 2)],\n", - ")\n", - "config_store.store_architecture_config(architecture_config)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Trainer\n", - "\n", - " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", - "from dacapo.experiments.trainers.gp_augments import (\n", - " ElasticAugmentConfig,\n", - " GammaAugmentConfig,\n", - " IntensityAugmentConfig,\n", - " IntensityScaleShiftAugmentConfig,\n", - ")\n", - "\n", - "trainer_config = GunpowderTrainerConfig(\n", - " name=\"cosem\",\n", - " batch_size=1,\n", - " learning_rate=0.0001,\n", - " num_data_fetchers=20,\n", - " augments=[\n", - " ElasticAugmentConfig(\n", - " control_point_spacing=[100, 100, 100],\n", - " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", - " rotation_interval=(0.0, 1.5707963267948966),\n", - " subsample=8,\n", - " uniform_3d_rotation=True,\n", - " ),\n", - " # Create an intensity augment config scaling from .25 to 1.25, shifting from -.5 to .35, and with clipping\n", - " ...,\n", - " # Create a gamma augment config with range .5 to 2\n", - " ...,\n", - " # Create an intensity scale shift agument config to rescale data from the range 0->1 to -1->1\n", - " ...,\n", - " ],\n", - " snapshot_interval=10000,\n", - " min_masked=0.05,\n", - " clip_raw=True,\n", - ")\n", - "# Store the trainer\n", - "config_store....\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Run\n", - " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments import RunConfig\n", - "from dacapo.experiments.run import Run\n", - "\n", - "start_config = None\n", - "\n", - "# Uncomment to start from a pretrained model\n", - "# start_config = StartConfig(\n", - "# \"setup04\",\n", - "# \"best\",\n", - "# )\n", - "\n", - "iterations = 2000\n", - "validation_interval = iterations // 2\n", - "# Set up a run using all of the configs and settings you created above\n", - "run_config = ...\n", - "\n", - "print(run_config.name)\n", - "config_store...\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Train\n", - " To train one of the runs, you can either do it by first creating a **Run** directly from the run config\n", - " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. This may cause errors" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.train import train_run\n", - "from dacapo.experiments.run import Run\n", - "# load the run and train it\n", - "run = Run(config_store...)\n", - "train_run(run)\n", - "" - ] - } - ], - "nbformat": 4, - "nbformat_minor": 2, - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 3 - } - } -} \ No newline at end of file diff --git a/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_finetune_example-checkpoint.ipynb b/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_finetune_example-checkpoint.ipynb deleted file mode 100644 index cabecc87..00000000 --- a/dacapo/examples/distance_task/.ipynb_checkpoints/cosem_finetune_example-checkpoint.ipynb +++ /dev/null @@ -1,271 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# First we need to create a config store to store our configurations\n", - "from dacapo.store.create_store import create_config_store\n", - "\n", - "config_store = create_config_store()\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Datasplit\n", - " Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?\n", - " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", - " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.datasplits import DataSplitGenerator\n", - "from funlib.geometry import Coordinate\n", - "\n", - "input_resolution = Coordinate(8, 8, 8)\n", - "output_resolution = Coordinate(4, 4, 4)\n", - "datasplit_config = DataSplitGenerator.generate_from_csv(\n", - " \"cosem_example.csv\", input_resolution, output_resolution\n", - ").compute()\n", - "\n", - "datasplit = datasplit_config.datasplit_type(datasplit_config)\n", - "viewer = datasplit._neuroglancer()\n", - "config_store.store_datasplit_config(datasplit_config)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Task\n", - " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", - " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", - " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", - " also require specific non-linearities or output formats from your model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.tasks import DistanceTaskConfig\n", - "\n", - "task_config = DistanceTaskConfig(\n", - " name=\"cosem_distance_task_4nm\",\n", - " channels=[\"mito\"],\n", - " clip_distance=40.0,\n", - " tol_distance=40.0,\n", - " scale_factor=80.0,\n", - ")\n", - "config_store.store_task_config(task_config)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Architecture\n", - "\n", - " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", - "\n", - "architecture_config = CNNectomeUNetConfig(\n", - " name=\"upsample_unet\",\n", - " input_shape=Coordinate(216, 216, 216),\n", - " eval_shape_increase=Coordinate(72, 72, 72),\n", - " fmaps_in=1,\n", - " num_fmaps=12,\n", - " fmaps_out=72,\n", - " fmap_inc_factor=6,\n", - " downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],\n", - " constant_upsample=True,\n", - " upsample_factors=[(2, 2, 2)],\n", - ")\n", - "config_store.store_architecture_config(architecture_config)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Trainer\n", - "\n", - " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", - "from dacapo.experiments.trainers.gp_augments import (\n", - " ElasticAugmentConfig,\n", - " GammaAugmentConfig,\n", - " IntensityAugmentConfig,\n", - " IntensityScaleShiftAugmentConfig,\n", - ")\n", - "\n", - "trainer_config = GunpowderTrainerConfig(\n", - " name=\"cosem_finetune\",\n", - " batch_size=1,\n", - " learning_rate=0.0001,\n", - " num_data_fetchers=20,\n", - " augments=[\n", - " ElasticAugmentConfig(\n", - " control_point_spacing=[100, 100, 100],\n", - " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", - " rotation_interval=(0.0, 1.5707963267948966),\n", - " subsample=8,\n", - " uniform_3d_rotation=True,\n", - " ),\n", - " IntensityAugmentConfig(scale=(0.25, 1.75), shift=(-0.5, 0.35), clip=True),\n", - " GammaAugmentConfig(gamma_range=(0.5, 2.0)),\n", - " IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0),\n", - " ],\n", - " snapshot_interval=10000,\n", - " min_masked=0.05,\n", - " clip_raw=True,\n", - ")\n", - "config_store.store_trainer_config(trainer_config)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Run\n", - " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments import RunConfig\n", - "from dacapo.experiments.run import Run\n", - "\n", - "start_config = None\n", - "\n", - "# Uncomment to start from a pretrained model\n", - "from dacapo.experiments.starts import CosemStartConfig\n", - "\n", - "start_config = CosemStartConfig(\"setup04\", \"1820500\")\n", - "start_config.start_type(start_config).check()\n", - "iterations = 2000\n", - "validation_interval = iterations // 2\n", - "repetitions = 1\n", - "for i in range(repetitions):\n", - " run_config = RunConfig(\n", - " name=\"cosem_distance_run_4nm_finetune\",\n", - " # # NOTE: This is a template for the name of the run. You can customize it as you see fit.\n", - " # name=(\"_\").join(\n", - " # [\n", - " # \"example\",\n", - " # \"scratch\" if start_config is None else \"finetuned\",\n", - " # datasplit_config.name,\n", - " # task_config.name,\n", - " # architecture_config.name,\n", - " # trainer_config.name,\n", - " # ]\n", - " # )\n", - " # + f\"__{i}\",\n", - " datasplit_config=datasplit_config,\n", - " task_config=task_config,\n", - " architecture_config=architecture_config,\n", - " trainer_config=trainer_config,\n", - " num_iterations=iterations,\n", - " validation_interval=validation_interval,\n", - " repetition=i,\n", - " start_config=start_config,\n", - " )\n", - "\n", - " print(run_config.name)\n", - " config_store.store_run_config(run_config)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Train\n", - " To train one of the runs, you can either do it by first creating a **Run** directly from the run config\n", - " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. 
This may cause errors" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.train import train_run\n", - "from dacapo.experiments.run import Run\n", - "from dacapo.store.create_store import create_config_store\n", - "\n", - "config_store = create_config_store()\n", - "\n", - "run = Run(config_store.retrieve_run_config(\"cosem_distance_run_4nm_finetune\"))\n", - "train_run(run)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements.\n", - " # %%\n", - " from dacapo.validate import validate\n", - " # validate(run_config.name, iterations, num_workers=32)\n", - " validate(\"cosem_distance_run\", 1500, num_workers=10)\n", - " # %%" - ] - } - ], - "nbformat": 4, - "nbformat_minor": 2, - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 3 - } - } -} \ No newline at end of file diff --git a/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.ipynb b/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.ipynb deleted file mode 100644 index 59d0ec6b..00000000 --- a/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.ipynb +++ /dev/null @@ -1,548 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " # Dacapo\n", - "\n", - " DaCapo is a framework that allows for easy configuration and execution of established machine learning techniques on arbitrarily large volumes of multi-dimensional images.\n", - "\n", - " DaCapo has 4 major configurable components:\n", - " 1. **dacapo.datasplits.DataSplit**\n", - "\n", - " 2. **dacapo.tasks.Task**\n", - "\n", - " 3. **dacapo.architectures.Architecture**\n", - "\n", - " 4. **dacapo.trainers.Trainer**\n", - "\n", - " These are then combined in a single **dacapo.experiments.Run** that includes your starting point (whether you want to start training from scratch or continue off of a previously trained model) and stopping criterion (the number of iterations you want to train)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Environment setup\n", - " If you have not already done so, you will need to install DaCapo. You can do this by first creating a new environment and then installing DaCapo using pip.\n", - "\n", - " ```bash\n", - " conda create -n dacapo python=3.10\n", - " conda activate dacapo\n", - " ```\n", - "\n", - " Then, you can install DaCapo using pip, via GitHub:\n", - "\n", - " ```bash\n", - " pip install git+https://github.com/janelia-cellmap/dacapo.git\n", - " ```\n", - "\n", - " Or you can clone the repository and install it locally:\n", - "\n", - " ```bash\n", - " git clone https://github.com/janelia-cellmap/dacapo.git\n", - " cd dacapo\n", - " pip install -e .\n", - " ```\n", - "\n", - " Be sure to select this environment in your Jupyter notebook or JupyterLab." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Config Store\n", - " To define where the data goes, create a dacapo.yaml configuration file either in `~/.config/dacapo/dacapo.yaml` or in `./dacapo.yaml`. Here is a template:\n", - "\n", - " ```yaml\n", - " type: files\n", - " runs_base_dir: /path/to/my/data/storage\n", - " ```\n", - " The `runs_base_dir` defines where your on-disk data will be stored. The `type` setting determines the database backend. The default is `files`, which stores the data in a file tree on disk. Alternatively, you can use `mongodb` to store the data in a MongoDB database. To use MongoDB, you will need to provide a `mongodbhost` and `mongodbname` in the configuration file:\n", - "\n", - " ```yaml\n", - " ...\n", - " mongodbhost: mongodb://dbuser:dbpass@dburl:dbport/\n", - " mongodbname: dacapo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# First we need to create a config store to store our configurations\n", - "from dacapo.store.create_store import create_config_store\n", - "\n", - "config_store = create_config_store()\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Then let's make sure we have data to train on\n", - "from pathlib import Path\n", - "from dacapo import Options\n", - "from dacapo.examples.utils import get_viewer\n", - "from dacapo.examples.synthetic_source_worker import generate_synthetic_dataset\n", - "from funlib.geometry import Coordinate\n", - "from funlib.persistence import open_ds\n", - "\n", - "options = Options.instance()\n", - "runs_base_dir = options.runs_base_dir\n", - "force = False\n", - "num_workers = 32\n", - "\n", - "# First for training data\n", - "train_data_path = Path(runs_base_dir, \"example_train.zarr\")\n", - "try:\n", - " assert not force\n", - " raw_array = open_ds(str(train_data_path), \"raw\")\n", - " labels_array = open_ds(str(train_data_path), \"labels\")\n", - "except:\n", - " train_shape = Coordinate((512, 512, 512))\n", - " generate_synthetic_dataset(\n", - " train_data_path, shape=train_shape, overwrite=True, num_workers=num_workers\n", - " )\n", - " raw_array = open_ds(str(train_data_path), \"raw\")\n", - " labels_array = open_ds(str(train_data_path), \"labels\")\n", - "\n", - "get_viewer(raw_array, labels_array)\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Then for validation data\n", - "validate_data_path = Path(runs_base_dir, \"example_validate.zarr\")\n", - "try:\n", - " assert not force\n", - " raw_array = open_ds(str(validate_data_path), \"raw\")\n", - " labels_array = open_ds(str(validate_data_path), \"labels\")\n", - "except:\n", - " validate_shape = Coordinate((152, 152, 152)) * 3\n", - " generate_synthetic_dataset(\n", - " validate_data_path,\n", - " shape=validate_shape,\n", - " write_shape=Coordinate((152, 152, 152)),\n", - " overwrite=True,\n", - " num_workers=num_workers,\n", - " )\n", - "\n", - "get_viewer(raw_array, labels_array)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Datasplit\n", - " Where can you find your data? What format is it in? Does it need to be normalized? 
What data do you want to use for validation?\n", - " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", - " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.datasplits.datasets.arrays import (\n", - " BinarizeArrayConfig,\n", - " ZarrArrayConfig,\n", - " IntensitiesArrayConfig,\n", - ")\n", - "from dacapo.experiments.datasplits import TrainValidateDataSplitConfig\n", - "from dacapo.experiments.datasplits.datasets import RawGTDatasetConfig\n", - "from pathlib import Path\n", - "from dacapo import Options\n", - "\n", - "options = Options.instance()\n", - "runs_base_dir = options.runs_base_dir\n", - "\n", - "datasplit_config = TrainValidateDataSplitConfig(\n", - " name=\"synthetic_datasplit_config\",\n", - " train_configs=[\n", - " RawGTDatasetConfig(\n", - " name=\"train_data\",\n", - " weight=1,\n", - " raw_config=IntensitiesArrayConfig(\n", - " name=\"raw_train_data\",\n", - " source_array_config=ZarrArrayConfig(\n", - " name=\"raw_train_data_uint8\",\n", - " file_name=Path(runs_base_dir, \"example_train.zarr\"),\n", - " dataset=\"raw\",\n", - " ),\n", - " min=0.0,\n", - " max=255.0,\n", - " ),\n", - " gt_config=BinarizeArrayConfig(\n", - " name=\"gt_train_data\",\n", - " source_array_config=ZarrArrayConfig(\n", - " name=\"gt_train_data_zarr\",\n", - " file_name=Path(runs_base_dir, \"example_train.zarr\"),\n", - " dataset=\"labels\",\n", - " ),\n", - " groupings=[(\"labels\", [])],\n", - " ),\n", - " )\n", - " ],\n", - " validate_configs=[\n", - " RawGTDatasetConfig(\n", - " name=\"validate_data\",\n", - " weight=1,\n", - " raw_config=IntensitiesArrayConfig(\n", - " name=\"raw_validate_data\",\n", - " source_array_config=ZarrArrayConfig(\n", - " name=\"raw_validate_data_uint8\",\n", - " file_name=Path(runs_base_dir, \"example_validate.zarr\"),\n", - " dataset=\"raw\",\n", - " ),\n", - " min=0.0,\n", - " max=255.0,\n", - " ),\n", - " gt_config=BinarizeArrayConfig(\n", - " name=\"gt_validate_data\",\n", - " source_array_config=ZarrArrayConfig(\n", - " name=\"gt_validate_data_zarr\",\n", - " file_name=Path(runs_base_dir, \"example_validate.zarr\"),\n", - " dataset=\"labels\",\n", - " ),\n", - " groupings=[(\"labels\", [])],\n", - " ),\n", - " ),\n", - " ],\n", - ")\n", - "\n", - "config_store.store_datasplit_config(datasplit_config)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Task\n", - " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", - " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", - " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", - " also require specific non-linearities or output formats from your model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.tasks import DistanceTaskConfig\n", - "\n", - "task_config = DistanceTaskConfig(\n", - " name=\"example_distance_task\",\n", - " channels=[\"labels\"],\n", - " clip_distance=80.0,\n", - " tol_distance=80.0,\n", - " scale_factor=160.0,\n", - ")\n", - "config_store.store_task_config(task_config)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Architecture\n", - "\n", - " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", - "\n", - "architecture_config = CNNectomeUNetConfig(\n", - " name=\"example-unet\",\n", - " input_shape=(172, 172, 172),\n", - " fmaps_out=24,\n", - " fmaps_in=1,\n", - " num_fmaps=12,\n", - " fmap_inc_factor=2,\n", - " downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],\n", - " eval_shape_increase=(72, 72, 72),\n", - ")\n", - "config_store.store_architecture_config(architecture_config)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Trainer\n", - "\n", - " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", - "from dacapo.experiments.trainers.gp_augments import (\n", - " ElasticAugmentConfig,\n", - " GammaAugmentConfig,\n", - " IntensityAugmentConfig,\n", - " IntensityScaleShiftAugmentConfig,\n", - ")\n", - "\n", - "trainer_config = GunpowderTrainerConfig(\n", - " name=\"default\",\n", - " batch_size=1,\n", - " learning_rate=0.0001,\n", - " num_data_fetchers=20,\n", - " augments=[\n", - " ElasticAugmentConfig(\n", - " control_point_spacing=[100, 100, 100],\n", - " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", - " rotation_interval=(0.0, 1.5707963267948966),\n", - " subsample=8,\n", - " uniform_3d_rotation=True,\n", - " ),\n", - " IntensityAugmentConfig(scale=(0.25, 1.75), shift=(-0.5, 0.35), clip=True),\n", - " GammaAugmentConfig(gamma_range=(0.5, 2.0)),\n", - " IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0),\n", - " ],\n", - " snapshot_interval=10000,\n", - " min_masked=0.05,\n", - " clip_raw=True,\n", - ")\n", - "config_store.store_trainer_config(trainer_config)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Run\n", - " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.experiments import RunConfig\n", - "from dacapo.experiments.run import Run\n", - "\n", - "start_config = None\n", - "\n", - "# Uncomment to start from a pretrained model\n", - "# start_config = StartConfig(\n", - "# \"setup04\",\n", - "# \"best\",\n", - "# )\n", - "\n", - "iterations = 2000\n", - "validation_interval = iterations // 2\n", - "repetitions = 1\n", - "for i in range(repetitions):\n", - " run_config = RunConfig(\n", - " name=\"example_synthetic_distance_run\",\n", - " # # NOTE: This is a template for the name of the run. You can customize it as you see fit.\n", - " # name=(\"_\").join(\n", - " # [\n", - " # \"example\",\n", - " # \"scratch\" if start_config is None else \"finetuned\",\n", - " # datasplit_config.name,\n", - " # task_config.name,\n", - " # architecture_config.name,\n", - " # trainer_config.name,\n", - " # ]\n", - " # )\n", - " # + f\"__{i}\",\n", - " datasplit_config=datasplit_config,\n", - " task_config=task_config,\n", - " architecture_config=architecture_config,\n", - " trainer_config=trainer_config,\n", - " num_iterations=iterations,\n", - " validation_interval=validation_interval,\n", - " repetition=i,\n", - " start_config=start_config,\n", - " )\n", - "\n", - " print(run_config.name)\n", - " config_store.store_run_config(run_config)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Train\n", - " To train one of the runs, you can either do it by first creating a **Run** directly from the run config\n", - " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. This may cause errors" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.train import train_run\n", - "from dacapo.experiments.run import Run\n", - "from dacapo.store.create_store import create_config_store\n", - "\n", - "config_store = create_config_store()\n", - "\n", - "run = Run(config_store.retrieve_run_config(\"example_synthetic_distance_run\"))\n", - "train_run(run)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Validate\n", - " Once you have trained your model, you can validate it on the validation datasets used during training. You can use the `dacapo.validate` function to do this. You can also use the command line interface to validate a run: dacapo validate -r {run_config.name} -i {iteration}\n", - " Generally we setup training to automatically validate at a set interval and the model checkpoints are saved at these intervals." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.validate import validate\n", - "\n", - "validate(run_config.name, iterations, num_workers=16, overwrite=True)\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## Predict\n", - " Once you have trained and validated your model, you can use it to predict on new data. You can use the `dacapo.predict` function to do this. You can also use the command line interface to predict on a run: dacapo predict -r {run_config.name} -i {iteration} -ic {input_container} -id {input_dataset} -op {output_path}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# First let's make some test data\n", - "test_data_path = Path(runs_base_dir, \"example_test.zarr\")\n", - "try:\n", - " assert not force\n", - " raw_array = open_ds(str(test_data_path), \"raw\")\n", - " labels_array = open_ds(str(test_data_path), \"labels\")\n", - "except:\n", - " test_shape = Coordinate((152, 152, 152)) * 5\n", - " generate_synthetic_dataset(\n", - " test_data_path,\n", - " shape=test_shape,\n", - " overwrite=True,\n", - " write_shape=Coordinate((152, 152, 152)),\n", - " num_workers=num_workers,\n", - " )\n", - "\n", - "get_viewer(raw_array, labels_array)\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.predict import predict\n", - "\n", - "predict(\n", - " run_config.name,\n", - " iterations,\n", - " test_data_path,\n", - " \"raw\",\n", - " test_data_path,\n", - " num_workers=32,\n", - " overwrite=True,\n", - " output_dtype=\"float32\",\n", - " output_roi=raw_array.roi,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dacapo.validate import validate_run\n", - "\n", - "validate_run(run.name, 50, num_workers=32)\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "" - ] - } - ], - "nbformat": 4, - "nbformat_minor": 2, - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 3 - } - } -} \ No newline at end of file diff --git a/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.py b/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.py deleted file mode 100644 index 287fe9f0..00000000 --- a/dacapo/examples/distance_task/.ipynb_checkpoints/synthetic_example-checkpoint.py +++ /dev/null @@ -1,385 +0,0 @@ -# %% [markdown] -# # Dacapo -# -# DaCapo is a framework that allows for easy configuration and execution of established machine learning techniques on arbitrarily large volumes of multi-dimensional images. -# -# DaCapo has 4 major configurable components: -# 1. **dacapo.datasplits.DataSplit** -# -# 2. **dacapo.tasks.Task** -# -# 3. **dacapo.architectures.Architecture** -# -# 4. **dacapo.trainers.Trainer** -# -# These are then combined in a single **dacapo.experiments.Run** that includes your starting point (whether you want to start training from scratch or continue off of a previously trained model) and stopping criterion (the number of iterations you want to train). 
- -# %% [markdown] -# ## Environment setup -# If you have not already done so, you will need to install DaCapo. You can do this by first creating a new environment and then installing DaCapo using pip. -# -# ```bash -# conda create -n dacapo python=3.10 -# conda activate dacapo -# ``` -# -# Then, you can install DaCapo using pip, via GitHub: -# -# ```bash -# pip install git+https://github.com/janelia-cellmap/dacapo.git -# ``` -# -# Or you can clone the repository and install it locally: -# -# ```bash -# git clone https://github.com/janelia-cellmap/dacapo.git -# cd dacapo -# pip install -e . -# ``` -# -# Be sure to select this environment in your Jupyter notebook or JupyterLab. - -# %% [markdown] -# ## Config Store -# To define where the data goes, create a dacapo.yaml configuration file either in `~/.config/dacapo/dacapo.yaml` or in `./dacapo.yaml`. Here is a template: -# -# ```yaml -# type: files -# runs_base_dir: /path/to/my/data/storage -# ``` -# The `runs_base_dir` defines where your on-disk data will be stored. The `type` setting determines the database backend. The default is `files`, which stores the data in a file tree on disk. Alternatively, you can use `mongodb` to store the data in a MongoDB database. To use MongoDB, you will need to provide a `mongodbhost` and `mongodbname` in the configuration file: -# -# ```yaml -# ... -# mongodbhost: mongodb://dbuser:dbpass@dburl:dbport/ -# mongodbname: dacapo - -# %% -# First we need to create a config store to store our configurations -from dacapo.store.create_store import create_config_store - -config_store = create_config_store() - -# %% -# Then let's make sure we have data to train on -from pathlib import Path -from dacapo import Options -from dacapo.examples.utils import get_viewer -from dacapo.examples.synthetic_source_worker import generate_synthetic_dataset -from funlib.geometry import Coordinate -from funlib.persistence import open_ds - -options = Options.instance() -runs_base_dir = options.runs_base_dir -force = False -num_workers = 32 - -# First for training data -train_data_path = Path(runs_base_dir, "example_train.zarr") -try: - assert not force - raw_array = open_ds(str(train_data_path), "raw") - labels_array = open_ds(str(train_data_path), "labels") -except: - train_shape = Coordinate((512, 512, 512)) - generate_synthetic_dataset( - train_data_path, shape=train_shape, overwrite=True, num_workers=num_workers - ) - raw_array = open_ds(str(train_data_path), "raw") - labels_array = open_ds(str(train_data_path), "labels") - -get_viewer(raw_array, labels_array) - -# %% -# Then for validation data -validate_data_path = Path(runs_base_dir, "example_validate.zarr") -try: - assert not force - raw_array = open_ds(str(validate_data_path), "raw") - labels_array = open_ds(str(validate_data_path), "labels") -except: - validate_shape = Coordinate((152, 152, 152)) * 3 - generate_synthetic_dataset( - validate_data_path, - shape=validate_shape, - write_shape=Coordinate((152, 152, 152)), - overwrite=True, - num_workers=num_workers, - ) - -get_viewer(raw_array, labels_array) - -# %% [markdown] -# ## Datasplit -# Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation? - -# We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`. 
-# NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs. - -# %% -from dacapo.experiments.datasplits.datasets.arrays import ( - BinarizeArrayConfig, - ZarrArrayConfig, - IntensitiesArrayConfig, -) -from dacapo.experiments.datasplits import TrainValidateDataSplitConfig -from dacapo.experiments.datasplits.datasets import RawGTDatasetConfig -from pathlib import Path -from dacapo import Options - -options = Options.instance() -runs_base_dir = options.runs_base_dir - -datasplit_config = TrainValidateDataSplitConfig( - name="synthetic_datasplit_config", - train_configs=[ - RawGTDatasetConfig( - name="train_data", - weight=1, - raw_config=IntensitiesArrayConfig( - name="raw_train_data", - source_array_config=ZarrArrayConfig( - name="raw_train_data_uint8", - file_name=Path(runs_base_dir, "example_train.zarr"), - dataset="raw", - ), - min=0.0, - max=255.0, - ), - gt_config=BinarizeArrayConfig( - name="gt_train_data", - source_array_config=ZarrArrayConfig( - name="gt_train_data_zarr", - file_name=Path(runs_base_dir, "example_train.zarr"), - dataset="labels", - ), - groupings=[("labels", [])], - ), - ) - ], - validate_configs=[ - RawGTDatasetConfig( - name="validate_data", - weight=1, - raw_config=IntensitiesArrayConfig( - name="raw_validate_data", - source_array_config=ZarrArrayConfig( - name="raw_validate_data_uint8", - file_name=Path(runs_base_dir, "example_validate.zarr"), - dataset="raw", - ), - min=0.0, - max=255.0, - ), - gt_config=BinarizeArrayConfig( - name="gt_validate_data", - source_array_config=ZarrArrayConfig( - name="gt_validate_data_zarr", - file_name=Path(runs_base_dir, "example_validate.zarr"), - dataset="labels", - ), - groupings=[("labels", [])], - ), - ), - ], -) - -config_store.store_datasplit_config(datasplit_config) - -# %% [markdown] -# ## Task -# What do you want to learn? An instance segmentation? If so, how? Affinities, -# Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned -# and evaluated with specific loss functions and evaluation metrics. Some tasks may -# also require specific non-linearities or output formats from your model. - -# %% -from dacapo.experiments.tasks import DistanceTaskConfig - -task_config = DistanceTaskConfig( - name="example_distance_task", - channels=["labels"], - clip_distance=80.0, - tol_distance=80.0, - scale_factor=160.0, -) -config_store.store_task_config(task_config) - -# %% [markdown] -# ## Architecture -# -# The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want? - -# %% -from dacapo.experiments.architectures import CNNectomeUNetConfig - -architecture_config = CNNectomeUNetConfig( - name="example-unet", - input_shape=(172, 172, 172), - fmaps_out=24, - fmaps_in=1, - num_fmaps=12, - fmap_inc_factor=2, - downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)], - eval_shape_increase=(72, 72, 72), -) -config_store.store_architecture_config(architecture_config) - -# %% [markdown] -# ## Trainer -# -# How do you want to train? This config defines the training loop and how the other three components work together. 
What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with. - -# %% -from dacapo.experiments.trainers import GunpowderTrainerConfig -from dacapo.experiments.trainers.gp_augments import ( - ElasticAugmentConfig, - GammaAugmentConfig, - IntensityAugmentConfig, - IntensityScaleShiftAugmentConfig, -) - -trainer_config = GunpowderTrainerConfig( - name="default", - batch_size=1, - learning_rate=0.0001, - num_data_fetchers=20, - augments=[ - ElasticAugmentConfig( - control_point_spacing=[100, 100, 100], - control_point_displacement_sigma=[10.0, 10.0, 10.0], - rotation_interval=(0.0, 1.5707963267948966), - subsample=8, - uniform_3d_rotation=True, - ), - IntensityAugmentConfig(scale=(0.25, 1.75), shift=(-0.5, 0.35), clip=True), - GammaAugmentConfig(gamma_range=(0.5, 2.0)), - IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0), - ], - snapshot_interval=10000, - min_masked=0.05, - clip_raw=True, -) -config_store.store_trainer_config(trainer_config) - -# %% [markdown] -# ## Run -# Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum. - -# %% -from dacapo.experiments import RunConfig -from dacapo.experiments.run import Run - -start_config = None - -# Uncomment to start from a pretrained model -# start_config = StartConfig( -# "setup04", -# "best", -# ) - -iterations = 2000 -validation_interval = iterations // 2 -repetitions = 1 -for i in range(repetitions): - run_config = RunConfig( - name="example_synthetic_distance_run", - # # NOTE: This is a template for the name of the run. You can customize it as you see fit. - # name=("_").join( - # [ - # "example", - # "scratch" if start_config is None else "finetuned", - # datasplit_config.name, - # task_config.name, - # architecture_config.name, - # trainer_config.name, - # ] - # ) - # + f"__{i}", - datasplit_config=datasplit_config, - task_config=task_config, - architecture_config=architecture_config, - trainer_config=trainer_config, - num_iterations=iterations, - validation_interval=validation_interval, - repetition=i, - start_config=start_config, - ) - - print(run_config.name) - config_store.store_run_config(run_config) - -# %% [markdown] -# ## Train - -# To train one of the runs, you can either do it by first creating a **Run** directly from the run config -# NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. This may cause errors -# %% -from dacapo.train import train_run -from dacapo.experiments.run import Run -from dacapo.store.create_store import create_config_store - -config_store = create_config_store() - -run = Run(config_store.retrieve_run_config("example_synthetic_distance_run")) -train_run(run) - -# %% [markdown] -# If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements. - - -# %% [markdown] -# ## Validate - -# Once you have trained your model, you can validate it on the validation datasets used during training. You can use the `dacapo.validate` function to do this. 
You can also use the command line interface to validate a run: dacapo validate -r {run_config.name} -i {iteration} - -# Generally we setup training to automatically validate at a set interval and the model checkpoints are saved at these intervals. - -# %% -from dacapo.validate import validate - -validate(run_config.name, iterations, num_workers=16, overwrite=True) - -# %% [markdown] -# ## Predict -# Once you have trained and validated your model, you can use it to predict on new data. You can use the `dacapo.predict` function to do this. You can also use the command line interface to predict on a run: dacapo predict -r {run_config.name} -i {iteration} -ic {input_container} -id {input_dataset} -op {output_path} - -# %% -# First let's make some test data -test_data_path = Path(runs_base_dir, "example_test.zarr") -try: - assert not force - raw_array = open_ds(str(test_data_path), "raw") - labels_array = open_ds(str(test_data_path), "labels") -except: - test_shape = Coordinate((152, 152, 152)) * 5 - generate_synthetic_dataset( - test_data_path, - shape=test_shape, - overwrite=True, - write_shape=Coordinate((152, 152, 152)), - num_workers=num_workers, - ) - -get_viewer(raw_array, labels_array) - -# %% -from dacapo.predict import predict - -predict( - run_config.name, - iterations, - test_data_path, - "raw", - test_data_path, - num_workers=32, - overwrite=True, - output_dtype="float32", - output_roi=raw_array.roi, -) -# %% -from dacapo.validate import validate_run - -validate_run(run.name, 50, num_workers=32) - -# %% From d470a582ada135db2bbe76eebefcf16d839205f0 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 20 Mar 2024 15:06:57 -0400 Subject: [PATCH 15/20] =?UTF-8?q?fix:=20=F0=9F=90=9B=20Predict=20fix.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dacapo/blockwise/predict_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dacapo/blockwise/predict_worker.py b/dacapo/blockwise/predict_worker.py index 6c3822cb..05c08ae3 100644 --- a/dacapo/blockwise/predict_worker.py +++ b/dacapo/blockwise/predict_worker.py @@ -46,7 +46,6 @@ def cli(log_level): @click.option( "-i", "--iteration", - required=True, type=Optional[int], help="The training iteration of the model to use for prediction.", default=None, From be87111399ec58affbe7372c03053649110e01d8 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 20 Mar 2024 15:49:06 -0400 Subject: [PATCH 16/20] =?UTF-8?q?feat:=20=E2=9C=A8=20Generalize=20get=5Fvi?= =?UTF-8?q?ewer=20util?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dacapo/examples/utils.py | 112 ++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 60 deletions(-) diff --git a/dacapo/examples/utils.py b/dacapo/examples/utils.py index e4268590..617cb512 100644 --- a/dacapo/examples/utils.py +++ b/dacapo/examples/utils.py @@ -19,51 +19,71 @@ def get_viewer( - raw_array: gp.Array | Array | ZarrArray, - labels_array: gp.Array | Array | ZarrArray, - pred_array: Optional[gp.Array | Array | ZarrArray] = None, - pred_labels_array: Optional[gp.Array | Array | ZarrArray] = None, - width: int = 1500, - height: int = 600, -) -> IFrame: - arrays = { - "raw": raw_array, - "labels": labels_array, - } - if pred_array is not None: - arrays["pred"] = pred_array - if pred_labels_array is not None: - arrays["pred_labels"] = pred_labels_array - - data = {} - voxel_sizes = {} - for name, array in arrays.items(): + arrays: dict, width: int = 1500, height: int = 600, headless: bool = 
True +) -> neuroglancer.Viewer | IFrame: + for name, array_data in arrays.items(): + array = array_data["array"] if hasattr(array, "to_ndarray"): - data[name] = array.to_ndarray() + arrays[name]["array"] = array.to_ndarray() else: - data[name] = array.data + arrays[name]["array"] = array.data if hasattr(array, "voxel_size"): - voxel_sizes[name] = array.voxel_size + arrays[name]["voxel_sizes"] = array.voxel_size else: - voxel_sizes[name] = array.spec.voxel_size + arrays[name]["voxel_sizes"] = array.spec.voxel_size neuroglancer.set_server_bind_address("0.0.0.0") viewer = neuroglancer.Viewer() with viewer.txn() as state: state.showSlices = False - add_seg_layer(state, "labels", data["labels"], voxel_sizes["labels"]) + for name, array_data in arrays.items(): + meshes = "meshes" in array_data and array_data["meshes"] + is_seg = "is_seg" in array_data and array_data["is_seg"] + if is_seg: + add_seg_layer( + state, name, array_data["array"], array_data["voxel_sizes"], meshes + ) + else: + add_scalar_layer( + state, name, array_data["array"], array_data["voxel_sizes"] + ) - add_scalar_layer(state, "raw", data["raw"], voxel_sizes["raw"]) + if headless: + return viewer + else: + return IFrame(src=viewer, width=width, height=height) - if "pred" in data: - add_scalar_layer(state, "pred", data["pred"], voxel_sizes["pred"]) - if "pred_labels" in data: - add_seg_layer( - state, "pred_labels", data["pred_labels"], voxel_sizes["pred_labels"] - ) +def add_seg_layer(state, name, data, voxel_size, meshes=False): + if meshes: + kwargs = {"segments": np.unique(data[data > 0])} + else: + kwargs = {} + state.layers[name] = neuroglancer.SegmentationLayer( + # segments=[str(i) for i in np.unique(data[data > 0])], # this line will cause all objects to be selected and thus all meshes to be generated...will be slow if lots of high res meshes + source=neuroglancer.LocalVolume( + data=data, + dimensions=neuroglancer.CoordinateSpace( + names=["z", "y", "x"], + units=["nm", "nm", "nm"], + scales=voxel_size, + ), + ), + **kwargs, + ) + - return IFrame(src=viewer, width=width, height=height) +def add_scalar_layer(state, name, data, voxel_size): + state.layers[name] = neuroglancer.ImageLayer( + source=neuroglancer.LocalVolume( + data=data, + dimensions=neuroglancer.CoordinateSpace( + names=["z", "y", "x"], + units=["nm", "nm", "nm"], + scales=voxel_size, + ), + ), + ) class NeuroglancerRunViewer: @@ -214,31 +234,3 @@ def update_with_new_validation_if_possible(self): self.most_recent_iteration, validation_dataset.name, ) - - -def add_seg_layer(state, name, data, voxel_size): - state.layers[name] = neuroglancer.SegmentationLayer( - # segments=[str(i) for i in np.unique(data[data > 0])], # this line will cause all objects to be selected and thus all meshes to be generated...will be slow if lots of high res meshes - source=neuroglancer.LocalVolume( - data=data, - dimensions=neuroglancer.CoordinateSpace( - names=["z", "y", "x"], - units=["nm", "nm", "nm"], - scales=voxel_size, - ), - ), - segments=np.unique(data[data > 0]), - ) - - -def add_scalar_layer(state, name, data, voxel_size): - state.layers[name] = neuroglancer.ImageLayer( - source=neuroglancer.LocalVolume( - data=data, - dimensions=neuroglancer.CoordinateSpace( - names=["z", "y", "x"], - units=["nm", "nm", "nm"], - scales=voxel_size, - ), - ), - ) From 52280514737530b25832113314645b7fd440aa9f Mon Sep 17 00:00:00 2001 From: Jeff Rhoades <37990507+rhoadesScholar@users.noreply.github.com> Date: Wed, 20 Mar 2024 16:08:00 -0400 Subject: [PATCH 17/20] Update validate.py --- 
dacapo/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dacapo/validate.py b/dacapo/validate.py index 73864372..e45457df 100644 --- a/dacapo/validate.py +++ b/dacapo/validate.py @@ -17,7 +17,7 @@ def validate( run_name: str, iteration: int, - num_workers: int = 4, + num_workers: int = 1, output_dtype: str = "uint8", overwrite: bool = True, ): From 7f44619f6461b8dd83fa0814e6cadc70ee8673ba Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 20 Mar 2024 16:56:00 -0400 Subject: [PATCH 18/20] =?UTF-8?q?feat:=20=E2=9C=A8=20Update=20synthetic=20?= =?UTF-8?q?example=20notebook.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../distance_task/synthetic_example.ipynb | 300 +++++++++++++----- .../distance_task/synthetic_example.py | 75 +++-- dacapo/examples/utils.py | 2 +- 3 files changed, 274 insertions(+), 103 deletions(-) diff --git a/dacapo/examples/distance_task/synthetic_example.ipynb b/dacapo/examples/distance_task/synthetic_example.ipynb index 5673ea26..3c8d0cfe 100644 --- a/dacapo/examples/distance_task/synthetic_example.ipynb +++ b/dacapo/examples/distance_task/synthetic_example.ipynb @@ -70,33 +70,57 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating FileConfigStore:\n", + "\tpath: /nrs/cellmap/rhoadesj/dacapo_runs/configs\n" + ] + } + ], "source": [ "# First we need to create a config store to store our configurations\n", "from dacapo.store.create_store import create_config_store\n", "\n", "config_store = create_config_store()\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Then let's make sure we have data to train on. If this is already provided, you can skip to the Datasplit section." + "\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "# Then let's make sure we have data to train on. 
If this is already provided, you can skip to the Datasplit section.\n", + "\n", "from pathlib import Path\n", "from dacapo import Options\n", "from dacapo.examples.utils import get_viewer\n", @@ -127,15 +151,41 @@ " raw_array = open_ds(str(train_data_path), \"raw\")\n", " labels_array = open_ds(str(train_data_path), \"labels\")\n", "\n", - "get_viewer(raw_array, labels_array)\n", - "" + "arrays = {\n", + " \"raw\": {\"array\": raw_array},\n", + " \"labels\": {\"array\": labels_array, \"meshes\": True},\n", + "}\n", + "get_viewer(arrays, headless=False)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Then for validation data\n", "validate_data_path = Path(runs_base_dir, \"example_validate.zarr\")\n", @@ -152,9 +202,63 @@ " overwrite=True,\n", " num_workers=num_workers,\n", " )\n", + "arrays = {\n", + " \"raw\": {\"array\": raw_array},\n", + " \"labels\": {\"array\": labels_array, \"meshes\": True},\n", + "}\n", + "get_viewer(arrays, headless=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Then let's make some test data\n", + "test_data_path = Path(runs_base_dir, \"example_test.zarr\")\n", + "try:\n", + " assert not force_example_creation\n", + " raw_array = open_ds(str(test_data_path), \"raw\")\n", + " labels_array = open_ds(str(test_data_path), \"labels\")\n", + "except:\n", + " test_shape = Coordinate((152, 152, 152)) * 5\n", + " generate_synthetic_dataset(\n", + " test_data_path,\n", + " shape=test_shape,\n", + " overwrite=True,\n", + " write_shape=Coordinate((152, 152, 152)),\n", + " num_workers=num_workers,\n", + " )\n", "\n", - "get_viewer(raw_array, labels_array)\n", - "" + "arrays = {\n", + " \"raw\": {\"array\": raw_array},\n", + " \"labels\": {\"array\": labels_array, \"meshes\": True},\n", + "}\n", + "get_viewer(arrays, headless=False)\n" ] }, { @@ -169,9 +273,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:dacapo.experiments.datasplits.datasplit_generator: No targets specified, using all classes in the dataset as target ['labels'].\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Neuroglancer link: http://rhoadesj-ws1.hhmi.org:32941/v/d893eb501d1df982c9931ec02a68ee97a11888a5/\n" + ] + } + ], "source": [ "from pathlib import Path\n", "from dacapo.experiments.datasplits import DataSplitGenerator\n", @@ -187,8 +306,7 @@ "\n", "datasplit = datasplit_config.datasplit_type(datasplit_config)\n", "viewer = datasplit._neuroglancer()\n", - "config_store.store_datasplit_config(datasplit_config)\n", - "" + "config_store.store_datasplit_config(datasplit_config)\n" ] }, { @@ -283,7 +401,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -296,8 +414,7 @@ " tol_distance=80.0,\n", " scale_factor=160.0,\n", ")\n", - "config_store.store_task_config(task_config)\n", - "" + "config_store.store_task_config(task_config)\n" ] }, { 
@@ -311,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -331,8 +448,7 @@ " config_store.store_architecture_config(architecture_config)\n", "except:\n", " config_store.delete_architecture_config(architecture_config.name)\n", - " config_store.store_architecture_config(architecture_config)\n", - "" + " config_store.store_architecture_config(architecture_config)\n" ] }, { @@ -346,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -379,8 +495,7 @@ " min_masked=0.05,\n", " clip_raw=True,\n", ")\n", - "config_store.store_trainer_config(trainer_config)\n", - "" + "config_store.store_trainer_config(trainer_config)\n" ] }, { @@ -393,9 +508,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "example_synthetic_distance_run\n" + ] + } + ], "source": [ "from dacapo.experiments import RunConfig\n", "from dacapo.experiments.run import Run\n", @@ -409,7 +532,7 @@ "# )\n", "\n", "iterations = 2000\n", - "validation_interval = iterations // 2\n", + "validation_interval = 200 # iterations // 2\n", "repetitions = 1\n", "for i in range(repetitions):\n", " run_config = RunConfig(\n", @@ -442,8 +565,7 @@ " except:\n", " config_store.delete_run_config(run_config.name)\n", " config_store.store_run_config(run_config)\n", - "\n", - "" + "\n" ] }, { @@ -457,19 +579,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating FileConfigStore:\n", + "\tpath: /nrs/cellmap/rhoadesj/dacapo_runs/configs\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from dacapo.train import train_run\n", "from dacapo.experiments.run import Run\n", "from dacapo.store.create_store import create_config_store\n", "\n", "config_store = create_config_store()\n", - "\n", "run = Run(config_store.retrieve_run_config(run_config.name))\n", - "train_run(run)\n", - "" + "\n", + "# Now run\n", + "train_run(run)" ] }, { @@ -496,8 +650,7 @@ "source": [ "from dacapo.validate import validate\n", "\n", - "validate(run_config.name, iterations, num_workers=16, overwrite=True)\n", - "" + "validate(run_config.name, iterations, num_workers=1, overwrite=True)\n" ] }, { @@ -508,32 +661,6 @@ " Once you have trained and validated your model, you can use it to predict on new data. You can use the `dacapo.predict` function to do this. 
You can also use the command line interface to predict on a run: dacapo predict -r {run_config.name} -i {iteration} -ic {input_container} -id {input_dataset} -op {output_path}" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# First let's make some test data\n", - "test_data_path = Path(runs_base_dir, \"example_test.zarr\")\n", - "try:\n", - " assert not force_example_creation\n", - " raw_array = open_ds(str(test_data_path), \"raw\")\n", - " labels_array = open_ds(str(test_data_path), \"labels\")\n", - "except:\n", - " test_shape = Coordinate((152, 152, 152)) * 5\n", - " generate_synthetic_dataset(\n", - " test_data_path,\n", - " shape=test_shape,\n", - " overwrite=True,\n", - " write_shape=Coordinate((152, 152, 152)),\n", - " num_workers=num_workers,\n", - " )\n", - "\n", - "get_viewer(raw_array, labels_array)\n", - "" - ] - }, { "cell_type": "code", "execution_count": null, @@ -542,6 +669,8 @@ "source": [ "from dacapo.predict import predict\n", "\n", + "# test_data_path = Path(runs_base_dir, \"example_test.zarr\")\n", + "\n", "predict(\n", " run_config.name,\n", " iterations,\n", @@ -553,13 +682,26 @@ " output_dtype=\"float32\",\n", " output_roi=raw_array.roi,\n", ")\n", - "" + "\n", + "raw_array = open_ds(str(test_data_path), \"raw\")\n", + "pred_array = open_ds(str(test_data_path), \"predictions\")\n", + "gt_array = open_ds(str(test_data_path), \"labels\")\n", + "\n", + "arrays = {\n", + " \"raw\": {\"array\": raw_array},\n", + " \"labels\": {\"array\": gt_array, \"meshes\": True},\n", + " \"predictions\": {\"array\": pred_array},\n", + "}\n", + "get_viewer(arrays, headless=False) " ] } ], - "nbformat": 4, - "nbformat_minor": 2, "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -570,7 +712,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": 3 + "version": "3.10.13" } - } -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/dacapo/examples/distance_task/synthetic_example.py b/dacapo/examples/distance_task/synthetic_example.py index a8e75498..f89f66fb 100644 --- a/dacapo/examples/distance_task/synthetic_example.py +++ b/dacapo/examples/distance_task/synthetic_example.py @@ -93,8 +93,11 @@ ) raw_array = open_ds(str(train_data_path), "raw") labels_array = open_ds(str(train_data_path), "labels") - -get_viewer(raw_array, labels_array) +arrays = { + "raw": {"array": raw_array}, + "labels": {"array": labels_array, "meshes": True}, +} +get_viewer(arrays, headless=False) # %% # Then for validation data @@ -104,7 +107,7 @@ raw_array = open_ds(str(validate_data_path), "raw") labels_array = open_ds(str(validate_data_path), "labels") except: - validate_shape = Coordinate((152, 152, 152)) * 3 + validate_shape = Coordinate((152, 152, 152)) * 1 generate_synthetic_dataset( validate_data_path, shape=validate_shape, @@ -113,7 +116,34 @@ num_workers=num_workers, ) -get_viewer(raw_array, labels_array) +arrays = { + "raw": {"array": raw_array}, + "labels": {"array": labels_array, "meshes": True}, +} +get_viewer(arrays, headless=False) + +# %% +# Then let's make some test data +test_data_path = Path(runs_base_dir, "example_test.zarr") +try: + assert not force_example_creation + raw_array = open_ds(str(test_data_path), "raw") + labels_array = open_ds(str(test_data_path), "labels") +except: + test_shape = Coordinate((152, 152, 152)) * 3 + 
generate_synthetic_dataset( + test_data_path, + shape=test_shape, + overwrite=True, + write_shape=Coordinate((152, 152, 152)), + num_workers=num_workers, + ) + +arrays = { + "raw": {"array": raw_array}, + "labels": {"array": labels_array, "meshes": True}, +} +get_viewer(arrays, headless=False) # %% [markdown] # ## Datasplit @@ -356,10 +386,15 @@ from dacapo.train import train_run from dacapo.experiments.run import Run from dacapo.store.create_store import create_config_store +from dacapo.examples.utils import NeuroglancerRunViewer config_store = create_config_store() - run = Run(config_store.retrieve_run_config(run_config.name)) + +# Visualize as we go +run_viewer = NeuroglancerRunViewer(run) +run_viewer.start() +# %% train_run(run) # %% [markdown] @@ -382,25 +417,6 @@ # ## Predict # Once you have trained and validated your model, you can use it to predict on new data. You can use the `dacapo.predict` function to do this. You can also use the command line interface to predict on a run: dacapo predict -r {run_config.name} -i {iteration} -ic {input_container} -id {input_dataset} -op {output_path} -# %% -# First let's make some test data -test_data_path = Path(runs_base_dir, "example_test.zarr") -try: - assert not force_example_creation - raw_array = open_ds(str(test_data_path), "raw") - labels_array = open_ds(str(test_data_path), "labels") -except: - test_shape = Coordinate((152, 152, 152)) * 5 - generate_synthetic_dataset( - test_data_path, - shape=test_shape, - overwrite=True, - write_shape=Coordinate((152, 152, 152)), - num_workers=num_workers, - ) - -get_viewer(raw_array, labels_array) - # %% from dacapo.predict import predict @@ -416,3 +432,14 @@ output_dtype="float32", output_roi=raw_array.roi, ) + +raw_array = open_ds(str(test_data_path), "raw") +pred_array = open_ds(str(test_data_path), "predictions") +gt_array = open_ds(str(test_data_path), "labels") + +arrays = { + "raw": {"array": raw_array}, + "labels": {"array": gt_array, "meshes": True}, + "predictions": {"array": pred_array}, +} +get_viewer(arrays, headless=False) diff --git a/dacapo/examples/utils.py b/dacapo/examples/utils.py index 617cb512..82c9aa2d 100644 --- a/dacapo/examples/utils.py +++ b/dacapo/examples/utils.py @@ -38,7 +38,7 @@ def get_viewer( state.showSlices = False for name, array_data in arrays.items(): meshes = "meshes" in array_data and array_data["meshes"] - is_seg = "is_seg" in array_data and array_data["is_seg"] + is_seg = meshes or ("is_seg" in array_data and array_data["is_seg"]) if is_seg: add_seg_layer( state, name, array_data["array"], array_data["voxel_sizes"], meshes From 14c42dbb867bafa701001e233e04857b308aad73 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 20 Mar 2024 17:32:49 -0400 Subject: [PATCH 19/20] =?UTF-8?q?fix:=20=F0=9F=90=9B=20Fix=20predict=5Fwor?= =?UTF-8?q?ker?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dacapo/blockwise/predict_worker.py | 2 +- .../cosem_finetune_example.ipynb | 30 +- .../distance_task/synthetic_example.ipynb | 546 ++++++++++++++++-- 3 files changed, 510 insertions(+), 68 deletions(-) diff --git a/dacapo/blockwise/predict_worker.py b/dacapo/blockwise/predict_worker.py index 05c08ae3..59a5fbc7 100644 --- a/dacapo/blockwise/predict_worker.py +++ b/dacapo/blockwise/predict_worker.py @@ -46,7 +46,7 @@ def cli(log_level): @click.option( "-i", "--iteration", - type=Optional[int], + type=int, help="The training iteration of the model to use for prediction.", default=None, ) diff --git 
a/dacapo/examples/distance_task/cosem_finetune_example.ipynb b/dacapo/examples/distance_task/cosem_finetune_example.ipynb index 3517d96e..d50dfcf9 100644 --- a/dacapo/examples/distance_task/cosem_finetune_example.ipynb +++ b/dacapo/examples/distance_task/cosem_finetune_example.ipynb @@ -10,8 +10,7 @@ "from dacapo.store.create_store import create_config_store\n", "\n", "config_store = create_config_store()\n", - "\n", - "" + "\n" ] }, { @@ -43,8 +42,7 @@ "\n", "datasplit = datasplit_config.datasplit_type(datasplit_config)\n", "viewer = datasplit._neuroglancer()\n", - "config_store.store_datasplit_config(datasplit_config)\n", - "" + "config_store.store_datasplit_config(datasplit_config)\n" ] }, { @@ -73,8 +71,7 @@ " tol_distance=40.0,\n", " scale_factor=80.0,\n", ")\n", - "config_store.store_task_config(task_config)\n", - "" + "config_store.store_task_config(task_config)\n" ] }, { @@ -106,8 +103,7 @@ " constant_upsample=True,\n", " upsample_factors=[(2, 2, 2)],\n", ")\n", - "config_store.store_architecture_config(architecture_config)\n", - "" + "config_store.store_architecture_config(architecture_config)\n" ] }, { @@ -154,8 +150,7 @@ " min_masked=0.05,\n", " clip_raw=True,\n", ")\n", - "config_store.store_trainer_config(trainer_config)\n", - "" + "config_store.store_trainer_config(trainer_config)\n" ] }, { @@ -173,7 +168,6 @@ "outputs": [], "source": [ "from dacapo.experiments import RunConfig\n", - "from dacapo.experiments.run import Run\n", "\n", "from dacapo.experiments.starts import CosemStartConfig\n", "\n", @@ -199,8 +193,7 @@ " )\n", "\n", " print(run_config.name)\n", - " config_store.store_run_config(run_config)\n", - "" + " config_store.store_run_config(run_config)\n" ] }, { @@ -225,8 +218,7 @@ "config_store = create_config_store()\n", "\n", "run = Run(config_store.retrieve_run_config(\"cosem_distance_run_4nm_finetune\"))\n", - "train_run(run)\n", - "" + "train_run(run)\n" ] }, { @@ -237,8 +229,6 @@ ] } ], - "nbformat": 4, - "nbformat_minor": 2, "metadata": { "language_info": { "codemirror_mode": { @@ -252,5 +242,7 @@ "pygments_lexer": "ipython3", "version": 3 } - } -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/dacapo/examples/distance_task/synthetic_example.ipynb b/dacapo/examples/distance_task/synthetic_example.ipynb index 3c8d0cfe..986cc68f 100644 --- a/dacapo/examples/distance_task/synthetic_example.ipynb +++ b/dacapo/examples/distance_task/synthetic_example.ipynb @@ -70,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -92,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -102,7 +102,7 @@ " " + "" ] }, - "execution_count": 5, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -160,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -170,7 +170,7 @@ " " + "" ] }, - "execution_count": 6, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -211,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -221,7 +221,7 @@ " " + "" ] }, - "execution_count": 7, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -273,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -287,7 +287,7 @@ "name": "stdout", "output_type": "stream", "text": [ - 
"Neuroglancer link: http://rhoadesj-ws1.hhmi.org:32941/v/d893eb501d1df982c9931ec02a68ee97a11888a5/\n" + "Neuroglancer link: http://rhoadesj-ws1.hhmi.org:35653/v/4f2cdd3e9616cf79e45570c9ef61a29d7a799efb/\n" ] } ], @@ -401,7 +401,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -428,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -462,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -508,7 +508,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -532,7 +532,7 @@ "# )\n", "\n", "iterations = 2000\n", - "validation_interval = 200 # iterations // 2\n", + "validation_interval = iterations // 2\n", "repetitions = 1\n", "for i in range(repetitions):\n", " run_config = RunConfig(\n", @@ -579,7 +579,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -587,31 +587,142 @@ "output_type": "stream", "text": [ "Creating FileConfigStore:\n", - "\tpath: /nrs/cellmap/rhoadesj/dacapo_runs/configs\n", - "\n" + "\tpath: /nrs/cellmap/rhoadesj/dacapo_runs/configs\n" ] }, { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:dacapo.train:Found weights for iteration 2000, but run example_synthetic_distance_run was only trained until 1600. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting/resuming training for run ...\n", + "Creating FileStatsStore:\n", + "\tpath : /nrs/cellmap/rhoadesj/dacapo_runs/stats\n", + "Current state: trained until 1600/2000\n", + "Creating local weights store in directory /nrs/cellmap/rhoadesj/dacapo_runs\n", + "Retrieving weights for run , iteration 2000\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "training until 1700: 80%|████████ | 1600/2000 [00:00, iteration 2000\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updating training stats of run example_synthetic_distance_run after iteration 1600\n", + "Validating run example_synthetic_distance_run on dataset example_validate_[labels]_['labels']_8nm\n", + "Trained until 2000. 
Finished.\n" + ] } ], "source": [ @@ -644,9 +755,325 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validating run example_synthetic_distance_run at iteration 2000...\n", + "Creating FileConfigStore:\n", + "\tpath: /nrs/cellmap/rhoadesj/dacapo_runs/configs\n", + "Creating FileStatsStore:\n", + "\tpath : /nrs/cellmap/rhoadesj/dacapo_runs/stats\n", + "Validating run example_synthetic_distance_run on dataset example_validate_[labels]_['labels']_8nm\n", + "validation inputs already copied!\n", + "Predicting with input size (1952, 1952, 1952), output size (1216, 1216, 1216)\n", + "Total input ROI: [-368:4016, -368:4016, -368:4016] (4384, 4384, 4384), output ROI: [0:3648, 0:3648, 0:3648] (3648, 3648, 3648)\n", + "Running blockwise prediction with worker_file: /nrs/cellmap/rhoadesj/dacapo/dacapo/blockwise/predict_worker.py\n", + "Defining worker with command: ['/home/rhoadesj@hhmi.org/micromamba/envs/dacapo/bin/python', '/nrs/cellmap/rhoadesj/dacapo/dacapo/blockwise/predict_worker.py', 'start-worker', '--run-name', 'example_synthetic_distance_run', '--input_container', '/nrs/cellmap/rhoadesj/dacapo_runs/example_synthetic_distance_run/validation.zarr', '--input_dataset', \"inputs/example_validate_[labels]_['labels']_8nm/raw\", '--output_container', '/nrs/cellmap/rhoadesj/dacapo_runs/example_synthetic_distance_run/validation.zarr', '--output_dataset', \"2000/example_validate_[labels]_['labels']_8nm/prediction\", '--iteration', '2000']\n", + "Running blockwise with worker_file: /nrs/cellmap/rhoadesj/dacapo/dacapo/blockwise/predict_worker.py\n", + "Using compute context: LocalTorch(_device=None, oom_limit=5)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "validation inputs already copied!\n", + "Predicting with input size (1952, 1952, 1952), output size (1216, 1216, 1216)\n", + "Total input ROI: [-368:4016, -368:4016, -368:4016] (4384, 4384, 4384), output ROI: [0:3648, 0:3648, 0:3648] (3648, 3648, 3648)\n", + "Running blockwise prediction with worker_file: /nrs/cellmap/rhoadesj/dacapo/dacapo/blockwise/predict_worker.py\n", + "Defining worker with command: ['/home/rhoadesj@hhmi.org/micromamba/envs/dacapo/bin/python', '/nrs/cellmap/rhoadesj/dacapo/dacapo/blockwise/predict_worker.py', 'start-worker', '--run-name', 'example_synthetic_distance_run', '--input_container', '/nrs/cellmap/rhoadesj/dacapo_runs/example_synthetic_distance_run/validation.zarr', '--input_dataset', \"inputs/example_validate_[labels]_['labels']_8nm/raw\", '--output_container', '/nrs/cellmap/rhoadesj/dacapo_runs/example_synthetic_distance_run/validation.zarr', '--output_dataset', \"2000/example_validate_[labels]_['labels']_8nm/prediction\", '--iteration', '2000']\n", + "Running blockwise with worker_file: /nrs/cellmap/rhoadesj/dacapo/dacapo/blockwise/predict_worker.py\n", + "Using compute context: LocalTorch(_device=None, oom_limit=5)\n", + "\n", + "Execution Summary\n", + "-----------------\n", + "\n", + " Task predict_worker2024-03-20_17-23-01:\n", + "\n", + " num blocks : 27\n", + " completed ✔: 27 (skipped 0)\n", + " failed ✗: 0\n", + " orphaned ∅: 0\n", + "\n", + " all blocks processed successfully\n", + "Done predicting.\n", + "Predicted on dataset example_validate_[labels]_['labels']_8nm\n", + "Running blockwise with worker_file: /nrs/cellmap/rhoadesj/dacapo/dacapo/blockwise/threshold_worker.py\n", + "Using compute context: LocalTorch(_device=None, 
oom_limit=5)\n", + "\n", + "Execution Summary\n", + "-----------------\n", + "\n", + " Task threshold_worker2024-03-20_17-23-49:\n", + "\n", + " num blocks : 27\n", + " completed ✔: 27 (skipped 0)\n", + " failed ✗: 0\n", + " orphaned ∅: 0\n", + "\n", + " all blocks processed successfully\n", + "Evaluating binary segmentations on evaluation_data of shape: (456, 456, 456)\n", + "Running blockwise with worker_file: /nrs/cellmap/rhoadesj/dacapo/dacapo/blockwise/threshold_worker.py\n", + "Using compute context: LocalTorch(_device=None, oom_limit=5)\n", + "\n", + "Execution Summary\n", + "-----------------\n", + "\n", + " Task threshold_worker2024-03-20_17-24-44:\n", + "\n", + " num blocks : 27\n", + " completed ✔: 27 (skipped 0)\n", + " failed ✗: 0\n", + " orphaned ∅: 0\n", + "\n", + " all blocks processed successfully\n", + "Evaluating binary segmentations on evaluation_data of shape: (456, 456, 456)\n", + "Running blockwise with worker_file: /nrs/cellmap/rhoadesj/dacapo/dacapo/blockwise/threshold_worker.py\n", + "Using compute context: LocalTorch(_device=None, oom_limit=5)\n", + "\n", + "Execution Summary\n", + "-----------------\n", + "\n", + " Task threshold_worker2024-03-20_17-25-38:\n", + "\n", + " num blocks : 27\n", + " completed ✔: 27 (skipped 0)\n", + " failed ✗: 0\n", + " orphaned ∅: 0\n", + "\n", + " all blocks processed successfully\n", + "Evaluating binary segmentations on evaluation_data of shape: (456, 456, 456)\n", + "Creating FileStatsStore:\n", + "\tpath : /nrs/cellmap/rhoadesj/dacapo_runs/stats\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "88d8e80ddf214ec4a1db5d8c48c08951", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "predict_worker2024-03-20_17-23-01 ▶: 0%| | 0/27 [00:00\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from dacapo.predict import predict\n", "\n", @@ -684,7 +1134,7 @@ ")\n", "\n", "raw_array = open_ds(str(test_data_path), \"raw\")\n", - "pred_array = open_ds(str(test_data_path), \"predictions\")\n", + "pred_array = open_ds(str(test_data_path), f\"prediction_{run_config.name}_{iterations}\")\n", "gt_array = open_ds(str(test_data_path), \"labels\")\n", "\n", "arrays = {\n", From af8b67112bbe9a4489a85965c329f27eadd8bfef Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 20 Mar 2024 17:44:39 -0400 Subject: [PATCH 20/20] =?UTF-8?q?style:=20=F0=9F=8E=A8=20Black=20format.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dacapo/experiments/run.py | 4 +-- dacapo/experiments/starts/cosem_start.py | 16 +++++++---- dacapo/experiments/starts/start.py | 35 ++++++++++++++++-------- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/dacapo/experiments/run.py b/dacapo/experiments/run.py index 3af70139..ece31506 100644 --- a/dacapo/experiments/run.py +++ b/dacapo/experiments/run.py @@ -62,11 +62,11 @@ def __init__(self, run_config): if self.start is None: return else: - if hasattr(run_config.task_config,"channels"): + if hasattr(run_config.task_config, "channels"): new_head = run_config.task_config.channels else: new_head = None - self.start.initialize_weights(self.model,new_head=new_head) + self.start.initialize_weights(self.model, new_head=new_head) @staticmethod def get_validation_scores(run_config) -> ValidationScores: diff --git a/dacapo/experiments/starts/cosem_start.py b/dacapo/experiments/starts/cosem_start.py index 99930cee..fb943b45 
100644 --- a/dacapo/experiments/starts/cosem_start.py +++ b/dacapo/experiments/starts/cosem_start.py @@ -6,11 +6,12 @@ logger = logging.getLogger(__file__) + def get_model_setup(run): try: model = cosem.load_model(run) if hasattr(model, "classes_channels"): - classes_channels = model.classes_channels + classes_channels = model.classes_channels else: classes_channels = None if hasattr(model, "voxel_size_input"): @@ -23,9 +24,12 @@ def get_model_setup(run): voxel_size_output = None return classes_channels, voxel_size_input, voxel_size_output except Exception as e: - logger.error(f"could not load model setup: {e} - Not a big deal, model will train wiithout head matching") + logger.error( + f"could not load model setup: {e} - Not a big deal, model will train wiithout head matching" + ) return None, None, None - + + class CosemStart(Start): def __init__(self, start_config): self.run = start_config.run @@ -33,7 +37,9 @@ def __init__(self, start_config): self.name = f"{self.run}/{self.criterion}" channels, voxel_size_input, voxel_size_output = get_model_setup(self.run) if voxel_size_input is not None: - logger.warning(f"Starter model resolution: input {voxel_size_input} output {voxel_size_output}, Make sure to set the correct resolution for the input data.") + logger.warning( + f"Starter model resolution: input {voxel_size_input} output {voxel_size_output}, Make sure to set the correct resolution for the input data." + ) self.channels = channels def check(self): @@ -62,5 +68,3 @@ def initialize_weights(self, model, new_head=None): cosem.download_checkpoint(self.name, path) weights = weights_store._retrieve_weights(self.run, self.criterion) _set_weights(model, weights, self.run, self.criterion, self.channels, new_head) - - diff --git a/dacapo/experiments/starts/start.py b/dacapo/experiments/starts/start.py index 6c162203..dd503d41 100644 --- a/dacapo/experiments/starts/start.py +++ b/dacapo/experiments/starts/start.py @@ -3,9 +3,15 @@ logger = logging.getLogger(__file__) -head_keys = ["prediction_head.weight","prediction_head.bias","chain.1.weight","chain.1.bias"] +head_keys = [ + "prediction_head.weight", + "prediction_head.bias", + "chain.1.weight", + "chain.1.bias", +] -def match_heads(model, head_weights, old_head, new_head ): + +def match_heads(model, head_weights, old_head, new_head): for label in new_head: if label in old_head: logger.warning(f"matching head for {label}.") @@ -17,8 +23,11 @@ def match_heads(model, head_weights, old_head, new_head ): model.state_dict()[key][new_index] = new_value logger.warning(f"matched head for {label}.") + def _set_weights(model, weights, run, criterion, old_head=None, new_head=None): - logger.warning(f"loading weights from run {run}, criterion: {criterion}, old_head {old_head}, new_head: {new_head}") + logger.warning( + f"loading weights from run {run}, criterion: {criterion}, old_head {old_head}, new_head: {new_head}" + ) try: if old_head and new_head: try: @@ -33,7 +42,9 @@ def _set_weights(model, weights, run, criterion, old_head=None, new_head=None): try: model.load_state_dict(weights.model, strict=True) except: - logger.warning("Unable to load model in strict mode. Loading flexibly.") + logger.warning( + "Unable to load model in strict mode. Loading flexibly." 
+ ) model.load_state_dict(weights.model, strict=False) model = match_heads(model, head_weights, old_head, new_head) except RuntimeError as e: @@ -42,7 +53,9 @@ def _set_weights(model, weights, run, criterion, old_head=None, new_head=None): for key in head_keys: weights.model.pop(key, None) model.load_state_dict(weights.model, strict=False) - logger.warning(f"loaded weights in non strict mode from run {run}, criterion: {criterion}") + logger.warning( + f"loaded weights in non strict mode from run {run}, criterion: {criterion}" + ) else: try: model.load_state_dict(weights.model) @@ -54,14 +67,13 @@ def _set_weights(model, weights, run, criterion, old_head=None, new_head=None): for k, v in weights.model.items() if k in model_dict and v.size() == model_dict[k].size() } - model_dict.update( - pretrained_dict - ) + model_dict.update(pretrained_dict) model.load_state_dict(model_dict) logger.warning(f"loaded only common layers from weights") except RuntimeError as e: logger.warning(f"ERROR starter: {e}") + class Start(ABC): """ This class interfaces with the dacapo store to retrieve and load the @@ -90,12 +102,12 @@ def __init__(self, start_config): self.run = start_config.run self.criterion = start_config.criterion - if hasattr(start_config.task_config,"channels"): + if hasattr(start_config.task_config, "channels"): self.channels = start_config.task_config.channels else: - self.channels = None + self.channels = None - def initialize_weights(self, model,new_head=None): + def initialize_weights(self, model, new_head=None): """ Retrieves the weights from the dacapo store and load them into the model. @@ -115,4 +127,3 @@ def initialize_weights(self, model,new_head=None): weights_store = create_weights_store() weights = weights_store._retrieve_weights(self.run, self.criterion) _set_weights(model, weights, self.run, self.criterion, self.channels, new_head) -
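
Note (not part of the patch): the head-matching entry point reformatted above is Start.initialize_weights(model, new_head=...), which Run.__init__ now calls with the task config's channels when they are defined; match_heads then copies the pretrained output rows for labels shared between the old and new heads, and _set_weights falls back to non-strict loading for everything else. The sketch below is only an illustration of that call pattern, not code from this patch: the SimpleNamespace stand-in for a start config, the run name "example_pretrained_run", the "best" criterion, the channel lists, and the initialize_from_start helper are all assumptions, and a configured dacapo weights store with a matching checkpoint is still required for the weights to resolve.

from types import SimpleNamespace

from dacapo.experiments.starts.start import Start

# Stand-in for a start config: after this patch, Start.__init__ only reads
# .run, .criterion and, when present, .task_config.channels (the old head's
# channel names). The run name, criterion, and channels are illustrative.
start_config = SimpleNamespace(
    run="example_pretrained_run",
    criterion="best",
    task_config=SimpleNamespace(channels=["mito", "er"]),
)

start = Start(start_config)


def initialize_from_start(model, task_config):
    # Mirrors what Run.__init__ does after this patch: pass the new task's
    # channels (if any) as new_head so matching labels keep their pretrained
    # output rows; remaining layers are loaded non-strictly by _set_weights.
    new_head = getattr(task_config, "channels", None)
    start.initialize_weights(model, new_head=new_head)

In practice the model and task config come from the run's own configuration, so a helper like initialize_from_start would be invoked from within a Run rather than standalone.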