diff --git a/.github/workflows/miniwdl_check.yml b/.github/workflows/miniwdl_check.yml
index 8a8de2c..34d1a4c 100644
--- a/.github/workflows/miniwdl_check.yml
+++ b/.github/workflows/miniwdl_check.yml
@@ -1,7 +1,12 @@
 name: 'validate WDL'
-on: [pull_request]
+
+on:
+  pull_request:
+    branches: [ master, dev ]
+
 env:
   MINIWDL_VERSION: 1.8.0
+
 jobs:
   miniwdl-check:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/run_packaging_check.yml b/.github/workflows/run_packaging_check.yml
index 8a9fc1c..6144cbe 100644
--- a/.github/workflows/run_packaging_check.yml
+++ b/.github/workflows/run_packaging_check.yml
@@ -2,7 +2,9 @@
 
 name: 'packaging'
 
-on: pull_request
+on:
+  pull_request:
+    branches: [ master, dev ]
 
 jobs:
   build:
diff --git a/.github/workflows/run_pytest.yml b/.github/workflows/run_pytest.yml
index 7c8c662..b6a02ae 100644
--- a/.github/workflows/run_pytest.yml
+++ b/.github/workflows/run_pytest.yml
@@ -2,7 +2,9 @@
 
 name: 'pytest'
 
-on: pull_request
+on:
+  pull_request:
+    branches: [ master, dev ]
 
 jobs:
   build:
diff --git a/.gitignore b/.gitignore
index 80b6c0c..4d5b686 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,4 @@ dist/
 *.csv
 *.npz
 *.tar.gz
+data/
\ No newline at end of file
diff --git a/README.rst b/README.rst
index d5895c7..f32bdef 100644
--- a/README.rst
+++ b/README.rst
@@ -36,6 +36,32 @@ The current release contains the following modules. More modules will be added i
 
 Please refer to `the documentation `_ for a quick start tutorial.
 
+WARNING:
+
+The release tagged v0.3.1 included a bug which caused output count matrices to be incorrect. `The bug
+`_,
+introduced in `#303 `_, compromised output denoised count matrices
+(due to an integer overflow), which often showed up as negative entries in the output. The bug also existed on
+the master branch until `#347 `_.
+
+For now, we recommend using either v0.3.0 or the master branch (after #347) until v0.3.2 is released, and then using v0.3.2.
+
+Outputs generated with v0.3.1 (or the master branch between #303 and #347) can be salvaged by making use of the
+checkpoint file, which is not compromised. The following command will re-run the (inexpensive, CPU-only)
+estimation of the output count matrix using the saved posterior in the checkpoint file. Note the use of the new
+input argument ``--force-use-checkpoint``, which allows use of a checkpoint file produced by a different CellBender version:
+
+.. code-block:: console
+
+   (cellbender) $ cellbender remove-background \
+       --input my_raw_count_matrix_file.h5 \
+       --output my_cellbender_output_file.h5 \
+       --checkpoint path/to/ckpt.tar.gz \
+       --force-use-checkpoint
+
+where ``path/to/ckpt.tar.gz`` is the path to the checkpoint file generated by the original run. Ensure that you pair up the right
+``--input`` with the right ``--checkpoint``.
+
 Installation and Usage
 ----------------------
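
For readers unfamiliar with the failure mode described in the README warning above: fixed-width integers
silently wrap around when a value or index exceeds the type's range, which is how an overflow surfaces as
negative entries in a count matrix. A minimal, self-contained illustration (not code from this PR):

.. code-block:: python

    import numpy as np

    # int32 holds at most 2**31 - 1; one more wraps to the minimum value.
    x = np.array([2**31 - 1], dtype=np.int32)
    print(x + 1)  # [-2147483648] -- a "negative count"

    # Narrow casts wrap the same way: 2**16 becomes 0 as a uint16 index.
    idx = np.array([2**16], dtype=np.int64)
    print(idx.astype(np.uint16))  # [0] -- an index silently remapped
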
diff --git a/cellbender/remove_background/argparser.py b/cellbender/remove_background/argparser.py
index d53aca3..aebe670 100644
--- a/cellbender/remove_background/argparser.py
+++ b/cellbender/remove_background/argparser.py
@@ -48,10 +48,19 @@ def add_subparser_args(subparsers: argparse) -> argparse:
     subparser.add_argument("--checkpoint", nargs=None, type=str,
                            dest='input_checkpoint_tarball',
                            required=False, default=consts.CHECKPOINT_FILE_NAME,
-                           help="Checkpoint tarball produced by the same version "
+                           help="Checkpoint tarball produced by v0.3.0+ "
                                 "of CellBender remove-background. If present, "
                                 "and the workflow hashes match, training will "
                                 "restart from this checkpoint.")
+    subparser.add_argument("--force-use-checkpoint",
+                           dest='force_use_checkpoint', action="store_true",
+                           help="Normally, checkpoints can only be used if the CellBender "
+                                "code and certain input args match exactly. This flag allows you "
+                                "to bypass this requirement. An example use would be to create a new output "
+                                "using a checkpoint from a run of v0.3.1, a retracted version with "
+                                "faulty output count matrices. If you use this flag, "
+                                "ensure that the input file and the checkpoint match, because "
+                                "CellBender will not check.")
     subparser.add_argument("--expected-cells", nargs=None, type=int,
                            default=None,
                            dest="expected_cell_count",
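
For context on the new flag's mechanics: an argparse ``store_true`` argument is False unless the flag is
passed on the command line, so the hash check stays enforced for every existing workflow by default. A
minimal sketch of the same pattern (a toy parser, not the full CellBender CLI):

.. code-block:: python

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--force-use-checkpoint",
                        dest='force_use_checkpoint', action="store_true",
                        help="Bypass the checkpoint hash check.")

    assert parser.parse_args([]).force_use_checkpoint is False
    assert parser.parse_args(["--force-use-checkpoint"]).force_use_checkpoint is True
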
diff --git a/cellbender/remove_background/checkpoint.py b/cellbender/remove_background/checkpoint.py
index 206737b..44ed12e 100644
--- a/cellbender/remove_background/checkpoint.py
+++ b/cellbender/remove_background/checkpoint.py
@@ -154,7 +154,8 @@ def save_checkpoint(filebase: str,
 
 def load_checkpoint(filebase: Optional[str],
                     tarball_name: str = consts.CHECKPOINT_FILE_NAME,
-                    force_device: Optional[str] = None)\
+                    force_device: Optional[str] = None,
+                    force_use_checkpoint: bool = False)\
         -> Dict[str, Union['RemoveBackgroundPyroModel', pyro.optim.PyroOptim, DataLoader, bool]]:
     """Load checkpoint and prepare a RemoveBackgroundPyroModel and optimizer."""
 
@@ -163,6 +164,7 @@ def load_checkpoint(filebase: Optional[str],
         tarball_name=tarball_name,
         to_load=['model', 'optim', 'param_store', 'dataloader', 'args', 'random_state'],
         force_device=force_device,
+        force_use_checkpoint=force_use_checkpoint,
     )
     out.update({'loaded': True})
     logger.info(f'Loaded partially-trained checkpoint from {tarball_name}')
@@ -172,7 +174,8 @@ def load_from_checkpoint(filebase: Optional[str],
                          tarball_name: str = consts.CHECKPOINT_FILE_NAME,
                          to_load: List[str] = ['model'],
-                         force_device: Optional[str] = None) -> Dict:
+                         force_device: Optional[str] = None,
+                         force_use_checkpoint: bool = False) -> Dict:
     """Load specific files from a checkpoint tarball."""
 
     load_kwargs = {}
 
@@ -192,19 +195,24 @@ def load_from_checkpoint(filebase: Optional[str],
         else:
             # no tarball loaded, so do not continue trying to load files
             raise FileNotFoundError
-
-    # See if files have a hash matching input filebase.
-    if filebase is not None:
+
+    # If posterior is present, do not require run hash to match: will pick up
+    # after training and run estimation from existing posterior.
+    # This smoothly allows re-runs (including for problematic v0.3.1)
+    logger.debug(f'force_use_checkpoint: {force_use_checkpoint}')
+    if force_use_checkpoint or (filebase is None):
+        filebase = (glob.glob(os.path.join(tmp_dir, '*_model.torch'))[0]
+                    .replace('_model.torch', ''))
+        logger.debug(f'Accepting any file hash, so loading {filebase}*')
+
+    else:
+        # See if files have a hash matching input filebase.
         basename = os.path.basename(filebase)
         filebase = os.path.join(tmp_dir, basename)
         logger.debug(f'Looking for files with base name matching {filebase}*')
         if not os.path.exists(filebase + '_model.torch'):
             logger.info('Workflow hash does not match that of checkpoint.')
-            raise ValueError('Workflow hash does not match that of checkpoint.')
-    else:
-        filebase = (glob.glob(os.path.join(tmp_dir, '*_model.torch'))[0]
-                    .replace('_model.torch', ''))
-        logger.debug(f'Accepting any file hash, so loading {filebase}*')
+            raise ValueError('Workflow hash does not match that of checkpoint.')
 
     out = {}
 
@@ -265,9 +273,10 @@ def load_from_checkpoint(filebase: Optional[str],
     return out
 
 
-def attempt_load_checkpoint(filebase: str,
+def attempt_load_checkpoint(filebase: Optional[str],
                             tarball_name: str = consts.CHECKPOINT_FILE_NAME,
-                            force_device: Optional[str] = None)\
+                            force_device: Optional[str] = None,
+                            force_use_checkpoint: bool = False)\
         -> Dict[str, Union['RemoveBackgroundPyroModel', pyro.optim.PyroOptim, DataLoader, bool]]:
     """Load checkpoint and prepare a RemoveBackgroundPyroModel and optimizer,
     or return the inputs if loading fails."""
@@ -276,7 +285,8 @@ def attempt_load_checkpoint(filebase: str,
         logger.debug('Attempting to load checkpoint from ' + tarball_name)
         return load_checkpoint(filebase=filebase,
                                tarball_name=tarball_name,
-                               force_device=force_device)
+                               force_device=force_device,
+                               force_use_checkpoint=force_use_checkpoint)
 
     except FileNotFoundError:
         logger.debug('No tarball found')
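
The heart of the checkpoint.py change is the branch order in ``load_from_checkpoint``: forcing (or having
no filebase hash) now accepts whatever run is inside the extracted tarball by globbing for
``*_model.torch``, while the default path still demands that file names carry the current workflow's hash.
A simplified standalone sketch of that logic (``tmp_dir`` here is a stand-in for the extraction directory):

.. code-block:: python

    import glob
    import os

    def resolve_filebase(tmp_dir: str, filebase, force_use_checkpoint: bool) -> str:
        """Choose the checkpoint filebase, enforcing the hash check unless forced."""
        if force_use_checkpoint or (filebase is None):
            # Accept any hash: use the one model file extracted from the tarball.
            model = glob.glob(os.path.join(tmp_dir, '*_model.torch'))[0]
            return model.replace('_model.torch', '')
        candidate = os.path.join(tmp_dir, os.path.basename(filebase))
        if not os.path.exists(candidate + '_model.torch'):
            raise ValueError('Workflow hash does not match that of checkpoint.')
        return candidate
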
diff --git a/cellbender/remove_background/estimation.py b/cellbender/remove_background/estimation.py
index b4866d2..777bf03 100644
--- a/cellbender/remove_background/estimation.py
+++ b/cellbender/remove_background/estimation.py
@@ -21,6 +21,10 @@ logger = logging.getLogger('cellbender')
 
 
+N_CELLS_DATATYPE = np.int32
+N_GENES_DATATYPE = np.int32
+COUNT_DATATYPE = np.int32
+
 class EstimationMethod(ABC):
     """Base class for estimation of noise counts, given a posterior."""
 
@@ -52,7 +56,7 @@ def _estimation_array_to_csr(self,
                                  data: np.ndarray,
                                  m: np.ndarray,
                                  noise_offsets: Optional[Dict[int, int]],
-                                 dtype=np.int64) -> sp.csr_matrix:
+                                 dtype=COUNT_DATATYPE) -> sp.csr_matrix:
         """Say you have point estimates for each count matrix element (data)
         and you have the 'm'-indices for each value (m). This returns a CSR matrix
         that has the shape of the count matrix, where duplicate entries have
@@ -218,7 +222,7 @@ def _estimation_array_to_csr(index_converter,
                              data: np.ndarray,
                              m: np.ndarray,
                              noise_offsets: Optional[Dict[int, int]],
-                             dtype=np.int) -> sp.csr_matrix:
+                             dtype=COUNT_DATATYPE) -> sp.csr_matrix:
     """Say you have point estimates for each count matrix element (data)
     and you have the 'm'-indices for each value (m). This returns a CSR matrix
     that has the shape of the count matrix, where duplicate entries have
@@ -238,7 +242,7 @@ def _estimation_array_to_csr(index_converter,
     row, col = index_converter.get_ng_indices(m_inds=m)
     if noise_offsets is not None:
         data = data + np.array([noise_offsets.get(i, 0) for i in m])
-    coo = sp.coo_matrix((data.astype(dtype), (row.astype(np.uint64), col.astype(np.uint8))),
+    coo = sp.coo_matrix((data.astype(dtype), (row.astype(N_CELLS_DATATYPE), col.astype(N_GENES_DATATYPE))),
                         shape=index_converter.matrix_shape, dtype=dtype)
     coo.sum_duplicates()
     return coo.tocsr()
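
The dtype changes above are the core of the count-matrix fix: the row/column index arrays are cast before
the sparse matrix is assembled, so a type too narrow for the matrix shape wraps indices onto the wrong
entries, and duplicate-summing then merges values that were never duplicates. A small illustration,
independent of CellBender's data:

.. code-block:: python

    import numpy as np
    import scipy.sparse as sp

    row = np.array([0, 0])
    col = np.array([1, 257])  # 257 wraps around to 1 as a uint8
    data = np.array([5, 7])

    bad = sp.coo_matrix((data, (row, col.astype(np.uint8))), shape=(1, 300)).tocsr()
    good = sp.coo_matrix((data, (row, col.astype(np.int32))), shape=(1, 300)).tocsr()
    print(bad[0, 1])   # 12 -- both values collapsed onto column 1
    print(good[0, 1])  # 5  -- column 257 keeps its own entry
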
diff --git a/cellbender/remove_background/posterior.py b/cellbender/remove_background/posterior.py
index d99e533..5bdd890 100644
--- a/cellbender/remove_background/posterior.py
+++ b/cellbender/remove_background/posterior.py
@@ -107,13 +107,15 @@ def _do_posterior_regularization(posterior: Posterior):
     try:
         ckpt_posterior = load_from_checkpoint(tarball_name=args.input_checkpoint_tarball,
                                               filebase=args.checkpoint_filename,
-                                              to_load=['posterior'])
+                                              to_load=['posterior'],
+                                              force_use_checkpoint=args.force_use_checkpoint)
     except ValueError:
         # input checkpoint tarball was not a match for this workflow
         # but we still may have saved a new tarball
         ckpt_posterior = load_from_checkpoint(tarball_name=consts.CHECKPOINT_FILE_NAME,
                                               filebase=args.checkpoint_filename,
-                                              to_load=['posterior'])
+                                              to_load=['posterior'],
+                                              force_use_checkpoint=args.force_use_checkpoint)
     if os.path.exists(ckpt_posterior.get('posterior_file', 'does_not_exist')):
         # Load posterior if it was saved in the checkpoint.
         posterior.load(file=ckpt_posterior['posterior_file'])
diff --git a/cellbender/remove_background/run.py b/cellbender/remove_background/run.py
index ca2a592..7ac466b 100644
--- a/cellbender/remove_background/run.py
+++ b/cellbender/remove_background/run.py
@@ -307,6 +307,9 @@ def compute_output_denoised_counts_reports_metrics(posterior: Posterior,
         posterior.latents_map['p'],
     )
 
+    # Failsafe to ensure no negative counts.
+    assert np.all(denoised_counts.data >= 0), 'Negative count matrix entries in output'
+
     # TODO: correct cell probabilities so that any zero-count droplet becomes "empty"
 
     # Save denoised count matrix.
@@ -627,7 +630,8 @@ def run_inference(dataset_obj: SingleCellRNACountsDataset,
     # Attempt to load from a previously-saved checkpoint.
     ckpt = attempt_load_checkpoint(filebase=checkpoint_filename,
                                    tarball_name=args.input_checkpoint_tarball,
-                                   force_device='cuda:0' if args.use_cuda else 'cpu')
+                                   force_device='cuda:0' if args.use_cuda else 'cpu',
+                                   force_use_checkpoint=args.force_use_checkpoint)
     ckpt_loaded = ckpt['loaded']  # True if a checkpoint was loaded successfully
 
     if ckpt_loaded:
diff --git a/cellbender/remove_background/tests/test_checkpoint.py b/cellbender/remove_background/tests/test_checkpoint.py
index 59ccb9e..0dbaa22 100644
--- a/cellbender/remove_background/tests/test_checkpoint.py
+++ b/cellbender/remove_background/tests/test_checkpoint.py
@@ -414,6 +414,7 @@ def test_save_and_load_cellbender_checkpoint(tmpdir_factory, cuda, scheduler):
     args.constant_learning_rate = not scheduler
     args.debug = False
     args.input_checkpoint_tarball = 'none'
+    args.force_use_checkpoint = False
 
     create_random_state_blank_slate(0)
     pyro.clear_param_store()
diff --git a/cellbender/remove_background/tests/test_estimation.py b/cellbender/remove_background/tests/test_estimation.py
index c1cb931..2cca641 100644
--- a/cellbender/remove_background/tests/test_estimation.py
+++ b/cellbender/remove_background/tests/test_estimation.py
@@ -6,9 +6,10 @@ import torch
 
 from cellbender.remove_background.estimation import Mean, MAP, \
-    SingleSample, ThresholdCDF, MultipleChoiceKnapsack, pandas_grouped_apply
+    SingleSample, ThresholdCDF, MultipleChoiceKnapsack, pandas_grouped_apply, _estimation_array_to_csr, COUNT_DATATYPE
 from cellbender.remove_background.posterior import IndexConverter, \
     dense_to_sparse_op_torch, log_prob_sparse_to_dense
+from cellbender.remove_background.tests.conftest import sparse_matrix_equal
 
 from typing import Dict, Union
 
@@ -92,11 +93,15 @@ def test_mean_massive_m(log_prob_coo):
     new_shape = (coo.shape[0] + greater_than_max_int32, coo.shape[1])
     new_coo = sp.coo_matrix((coo.data, (new_row, coo.col)),
                             shape=new_shape)
+    print(f'original COO shape: {coo.shape}')
+    print(f'new COO shape: {new_coo.shape}')
+    print(f'new row minimum value: {new_coo.row.min()}')
+    print(f'new row maximum value: {new_coo.row.max()}')
     offset_dict = {k + greater_than_max_int32: v for k, v in log_prob_coo['offsets'].items()}
 
     # this is just a shim
-    converter = IndexConverter(total_n_cells=2,
-                               total_n_genes=new_coo.shape[0])
+    converter = IndexConverter(total_n_cells=new_coo.shape[0],
+                               total_n_genes=new_coo.shape[1])
 
     # set up and estimate
     estimator = Mean(index_converter=converter)
@@ -379,3 +384,28 @@ def test_parallel_pandas_grouped_apply(fun):
 
     np.testing.assert_array_equal(reg['m'], parallel['m'])
     np.testing.assert_array_equal(reg['result'], parallel['result'])
+
+
+def test_estimation_array_to_csr():
+
+    larger_than_uint16 = 2**16 + 1
+
+    converter = IndexConverter(total_n_cells=larger_than_uint16,
+                               total_n_genes=larger_than_uint16)
+    m = larger_than_uint16 + np.arange(-10, 10)
+    data = np.random.rand(len(m)) * -10
+    noise_offsets = None
+
+    output_csr = _estimation_array_to_csr(index_converter=converter, data=data, m=m, noise_offsets=noise_offsets, dtype=COUNT_DATATYPE)
+
+    # reimplementation here with totally permissive datatypes
+    cell_and_gene_dtype = np.float64
+    row, col = converter.get_ng_indices(m_inds=m)
+    if noise_offsets is not None:
+        data = data + np.array([noise_offsets.get(i, 0) for i in m])
+    coo = sp.coo_matrix((data.astype(COUNT_DATATYPE), (row.astype(cell_and_gene_dtype), col.astype(cell_and_gene_dtype))),
+                        shape=converter.matrix_shape, dtype=COUNT_DATATYPE)
+    coo.sum_duplicates()
+    truth_csr = coo.tocsr()
+
+    assert sparse_matrix_equal(output_csr, truth_csr)
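
``sparse_matrix_equal`` is imported from the test suite's ``conftest``; its exact definition is not shown
in this diff, but a typical helper of this kind (a sketch under that assumption, not the conftest's
verbatim code) checks the shape and then verifies that no element differs:

.. code-block:: python

    import scipy.sparse as sp

    def sparse_matrix_equal(a: sp.csr_matrix, b: sp.csr_matrix) -> bool:
        """True when two sparse matrices agree in shape and in every entry."""
        return a.shape == b.shape and (a != b).nnz == 0
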
diff --git a/cellbender/remove_background/tests/test_integration.py b/cellbender/remove_background/tests/test_integration.py
index 624c59d..078e9f2 100644
--- a/cellbender/remove_background/tests/test_integration.py
+++ b/cellbender/remove_background/tests/test_integration.py
@@ -50,3 +50,6 @@ def test_full_run(tmpdir_factory, h5_v3_file, cuda):
     adata_cell_barcodes = adata.obs_names[adata.obs['cell_probability'] > consts.CELL_PROB_CUTOFF]
     assert set(cell_barcodes) == set(adata_cell_barcodes), \
         'Cell barcodes in h5 are different from those in CSV file'
+
+    # ensure there are no negative count matrix entries in the output
+    assert np.all(adata.X.data >= 0), 'Negative count matrix entries in output'
diff --git a/requirements.txt b/requirements.txt
index 3585b6a..9b68a3c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,5 @@ jupyter
 jupyter_contrib_nbextensions
 notebook<7.0.0
 nbconvert<7.0.0
+lxml_html_clean
 psutil
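
As a final practical note, outputs from an affected version can be screened for the symptom directly,
without re-running anything. The sketch below assumes the CellRanger-style HDF5 layout that
remove-background writes (nonzero counts stored under ``matrix/data``); the file name is a placeholder.
Note that the bug does not always produce negative entries, so a clean scan is not proof of correctness:

.. code-block:: python

    import h5py
    import numpy as np

    # Read the nonzero count values from the output HDF5 file.
    with h5py.File('my_cellbender_output_file.h5', 'r') as f:
        counts = f['matrix/data'][:]

    if np.any(counts < 0):
        print('Affected: negative count entries found (see #347).')
    else:
        print('No negative entries found.')
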