Commit
Fix posterior and estimator integer overflow bugs on Windows (#259)
sjfleming authored Oct 31, 2023
1 parent 322971d commit 12b4758
Showing 7 changed files with 83 additions and 21 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/run_pytest.yml
@@ -1,4 +1,4 @@
-# Run cellbender's tests
+# Run cellbender test suite
 
 name: 'pytest'
 
@@ -7,11 +7,13 @@ on: pull_request
 jobs:
   build:
 
-    runs-on: 'ubuntu-latest'
     strategy:
       matrix:
+        os: ['ubuntu-latest', 'windows-latest']
        python-version: ['3.7']
 
+    runs-on: ${{ matrix.os }}
+
     steps:
     - name: 'Checkout repo'
       uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion cellbender/remove_background/checkpoint.py
@@ -297,7 +297,7 @@ def make_tarball(files: List[str], tarball_name: str) -> bool:
         for file in files:
             # without arcname, unpacking results in unpredictable file locations!
             tar.add(file, arcname=os.path.basename(file))
-    os.rename(tarball_name + '.tmp', tarball_name)
+    os.replace(tarball_name + '.tmp', tarball_name)
     return True
 
 
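The one-line change above is the whole checkpoint fix: on Windows, os.rename() raises FileExistsError when the destination already exists, while os.replace() overwrites it on every platform. A minimal sketch (not repo code) of the difference:

import os
import tempfile

# Minimal sketch, not repo code: both source and destination exist, as they
# do when a checkpoint tarball is rewritten in place.
d = tempfile.mkdtemp()
src = os.path.join(d, 'ckpt.tar.gz.tmp')
dst = os.path.join(d, 'ckpt.tar.gz')
for path, text in [(src, 'fresh'), (dst, 'stale')]:
    with open(path, 'w') as f:
        f.write(text)

# os.rename(src, dst)  # POSIX: replaces silently; Windows: FileExistsError
os.replace(src, dst)   # every platform: atomically replaces the destination
with open(dst) as f:
    assert f.read() == 'fresh'
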
10 changes: 5 additions & 5 deletions cellbender/remove_background/estimation.py
@@ -218,7 +218,7 @@ def _estimation_array_to_csr(index_converter,
                              data: np.ndarray,
                              m: np.ndarray,
                              noise_offsets: Optional[Dict[int, int]],
-                             dtype=np.int) -> sp.csr_matrix:
+                             dtype=np.int64) -> sp.csr_matrix:
     """Say you have point estimates for each count matrix element (data) and
     you have the 'm'-indices for each value (m). This returns a CSR matrix
     that has the shape of the count matrix, where duplicate entries have
@@ -229,7 +229,7 @@ def _estimation_array_to_csr(index_converter,
         a flat format, indexed by 'm'.
         m: Array of the same length as data, where each entry is an m-index.
         noise_offsets: Noise count offset values keyed by 'm'.
-        dtype: Data type for values of sparse matrix
+        dtype: Data type for sparse matrix. Int32 is too small for 'm' indices.
 
     Results:
         noise_csr: Noise point estimate, as a CSR sparse matrix.
@@ -238,7 +238,7 @@ def _estimation_array_to_csr(index_converter,
     row, col = index_converter.get_ng_indices(m_inds=m)
     if noise_offsets is not None:
         data = data + np.array([noise_offsets.get(i, 0) for i in m])
-    coo = sp.coo_matrix((data.astype(dtype), (row.astype(dtype), col.astype(dtype))),
+    coo = sp.coo_matrix((data.astype(dtype), (row.astype(np.uint64), col.astype(np.uint32))),
                         shape=index_converter.matrix_shape, dtype=dtype)
     coo.sum_duplicates()
     return coo.tocsr()
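
This hunk is the heart of the estimator fix: flat 'm' indices grow as cells times genes, so they exceed int32 (NumPy's default integer on Windows) long before memory is a problem. A sketch of the failure mode, with made-up sizes:

import numpy as np
import scipy.sparse as sp

# Sketch with made-up sizes, not repo code. A flat index past 2**31 - 1
# survives as uint64 but wraps negative if it ever passes through int32.
m_index = np.array([2_200_000_000], dtype=np.uint64)
print(m_index.astype(np.int32))  # [-2094967296], the Windows failure mode

# With unsigned index arrays, scipy stores the entry where it belongs.
coo = sp.coo_matrix((np.array([1.0]), (m_index, np.array([3], dtype=np.uint32))),
                    shape=(2_300_000_000, 10))
print(coo.row[0])  # 2200000000
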
@@ -785,7 +785,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
     """
     array_length = len(np.unique(noise_log_prob_coo.row))
 
-    m = np.zeros(array_length)
+    m = np.zeros(array_length, dtype=np.uint64)
     out = np.zeros(array_length)
     a = 0
 
@@ -804,7 +804,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
         out[a:(a + len_s)] = s.detach().cpu().numpy()
         a = a + len_s
 
-    return {'m': m.astype(int), 'result': out}
+    return {'m': m, 'result': out}
 
 
 def pandas_grouped_apply(coo: sp.coo_matrix,
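The apply_function_dense_chunks hunks remove two quiet defaults: np.zeros() allocates float64 unless given a dtype, and .astype(int) casts to the platform C long, which is only 32 bits on Windows under NumPy 1.x. A short sketch, not repo code:

import numpy as np

m = np.zeros(4)
print(m.dtype)        # float64: the 'm' indices were silently floats before
print(np.dtype(int))  # int64 on Linux/macOS; int32 on Windows (NumPy 1.x)

# Keeping the array uint64 end to end avoids both the float round trip and
# the platform-dependent truncation of large indices.
m = np.zeros(4, dtype=np.uint64)
m[0] = 2_200_000_000
print(m[0])  # 2200000000, exact
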
4 changes: 2 additions & 2 deletions cellbender/remove_background/posterior.py
@@ -555,7 +555,7 @@ def _get_cell_noise_count_posterior_coo(
         # Put the counts into a sparse csr_matrix.
         self._noise_count_posterior_coo = sp.coo_matrix(
             (log_probs, (m, c)),
-            shape=[np.prod(self.count_matrix_shape), n_counts_max],
+            shape=[np.prod(self.count_matrix_shape, dtype=np.uint64), n_counts_max],
         )
         self._noise_count_posterior_coo_offsets = nonzero_noise_offset_dict
         return self._noise_count_posterior_coo
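
np.prod() has the same trap: without an explicit dtype it accumulates in the default platform integer, so the product of a large count-matrix shape wraps on Windows. A sketch that forces int32 to reproduce the Windows behavior on any OS:

import numpy as np

# Sketch with made-up sizes: 60k cells x 40k genes overflows int32.
# dtype=np.int32 stands in for the Windows default accumulator here;
# the fixed code passes dtype=np.uint64 instead.
shape = (60_000, 40_000)
print(np.prod(shape, dtype=np.int32))   # -1894967296, wrapped
print(np.prod(shape, dtype=np.uint64))  # 2400000000
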
@@ -1528,7 +1528,7 @@ def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndarray) -> np.ndar
         if not ((gene_inds >= 0) & (gene_inds < self.total_n_genes)).all():
             raise ValueError(f'Requested gene_inds out of range: '
                              f'{gene_inds[(gene_inds < 0) | (gene_inds >= self.total_n_genes)]}')
-        return cell_inds * self.total_n_genes + gene_inds
+        return cell_inds.astype(np.uint64) * self.total_n_genes + gene_inds.astype(np.uint64)
 
     def get_ng_indices(self, m_inds: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         """Given a list of 'm' index values, return two arrays: cell index values
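get_m_indices() is where the large flat indices are born, which is why both operands are promoted to uint64 before the multiply; casting the result afterwards would be too late. A sketch with made-up sizes:

import numpy as np

# Sketch, not repo code: the wrap happens inside the int32 multiply, so
# promoting the *result* cannot recover the true index.
total_n_genes = np.int32(40_000)
cell_inds = np.array([60_000], dtype=np.int32)
gene_inds = np.array([123], dtype=np.int32)

late = (cell_inds * total_n_genes + gene_inds).astype(np.uint64)  # garbage
early = cell_inds.astype(np.uint64) * 40_000 + gene_inds.astype(np.uint64)
print(late)   # e.g. [18446744071814584443]
print(early)  # [2400000123]
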
24 changes: 22 additions & 2 deletions cellbender/remove_background/tests/conftest.py
@@ -15,12 +15,32 @@
 USE_CUDA = torch.cuda.is_available()
 
 
-def sparse_matrix_equal(mat1: sp.csc_matrix,
-                        mat2: sp.csc_matrix):
+def sparse_matrix_equal(mat1, mat2):
     """Fast assertion that sparse matrices are equal"""
+    if type(mat1) == sp.coo_matrix:
+        return coo_equal(mat1, mat2)
+    elif (type(mat1) == sp.csc_matrix) or (type(mat1) == sp.csr_matrix):
+        return csc_or_csr_equal(mat1, mat2)
+    else:
+        raise ValueError(f"Error with test functions: sparse_matrix_equal() was called with a {type(mat1)}")
+
+
+def csc_or_csr_equal(mat1: sp.csc_matrix,
+                     mat2: sp.csc_matrix):
+    """Fast assertion that CSC sparse matrices are equal"""
     return (mat1 != mat2).sum() == 0
+
+
+def coo_equal(mat1: sp.csc_matrix,
+              mat2: sp.csc_matrix):
+    """Fast assertion that COO sparse matrices are equal"""
+    return (
+        ((mat1.data != mat2.data).sum() == 0)
+        and ((mat1.row != mat2.row).sum() == 0)
+        and ((mat1.col != mat2.col).sum() == 0)
+    )
 
 
 def tensors_equal(a: torch.Tensor,
                   b: torch.Tensor):
     """Assertion that torch tensors are equal for each element"""
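A hypothetical usage of the helpers above (the conftest import path is assumed): COO matrices are compared field by field on data, row, and col, so the check is sensitive to the exact stored representation, which is what the save/load round-trip tests need to preserve.

import numpy as np
import scipy.sparse as sp
# Assumed import path; in practice these helpers live in the test conftest.
from cellbender.remove_background.tests.conftest import sparse_matrix_equal

row = np.array([0, 5], dtype=np.uint64)
col = np.array([1, 2])
mat = sp.coo_matrix((np.array([-1.5, -0.5]), (row, col)), shape=(10, 4))

assert sparse_matrix_equal(mat, mat.copy())           # dispatches to coo_equal
assert sparse_matrix_equal(mat.tocsr(), mat.tocsr())  # dispatches to csc_or_csr_equal
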
23 changes: 23 additions & 0 deletions cellbender/remove_background/tests/test_estimation.py
@@ -81,6 +81,29 @@ def log_prob_coo(request, log_prob_coo_base) \
     raise ValueError(f'Test writing error: requested "{request.param}" log_prob_coo')
 
 
+def test_mean_massive_m(log_prob_coo):
+    """Sets up a posterior COO with massive m values that are > max(int32).
+    Will trigger github issue #252 if a bug exists; no assertion necessary.
+    """
+
+    coo = log_prob_coo['coo']
+    greater_than_max_int32 = 2200000000
+    new_row = coo.row.astype(np.int64) + greater_than_max_int32
+    new_shape = (coo.shape[0] + greater_than_max_int32, coo.shape[1])
+    new_coo = sp.coo_matrix((coo.data, (new_row, coo.col)),
+                            shape=new_shape)
+    offset_dict = {k + greater_than_max_int32: v for k, v in log_prob_coo['offsets'].items()}
+
+    # this is just a shim
+    converter = IndexConverter(total_n_cells=2,
+                               total_n_genes=new_coo.shape[0])
+
+    # set up and estimate
+    estimator = Mean(index_converter=converter)
+    noise_csr = estimator.estimate_noise(noise_log_prob_coo=new_coo,
+                                         noise_offsets=offset_dict)
+
+
 @pytest.fixture(scope='module', params=['exact', 'filtered', 'unsorted'])
 def mckp_log_prob_coo(request, log_prob_coo_base) \
         -> Dict[str, Union[sp.coo_matrix, np.ndarray, Dict[int, int]]]:
35 changes: 26 additions & 9 deletions cellbender/remove_background/tests/test_posterior.py
@@ -368,23 +368,35 @@ def test_compute_mean_target_removal_as_function(log_prob_coo, fpr, per_gene, cu
 
 @pytest.mark.parametrize('blank_noise_offsets', [False, True], ids=['', 'no_noise_offsets'])
-def test_save_and_load(tmpdir_factory, blank_noise_offsets):
+@pytest.mark.parametrize('m', [1000, 2200000000], ids=['small', 'big'])
+def test_save_and_load(tmpdir_factory, blank_noise_offsets, m):
     """Test that a round trip through save and load gives the same thing"""
 
     tmp_dir = tmpdir_factory.mktemp('posterior')
     filename = tmp_dir.join('posterior.h5')
 
-    m = 1000
     n = 20
+    num_nonzeros = 1000
 
     posterior = Posterior(dataset_obj=None, vi_model=None)  # blank
 
-    posterior_coo = sp.random(m, n, density=0.1, format='coo', dtype=float)
-    posterior_coo2 = sp.random(m, n, density=0.08, format='coo', dtype=float)
+    # old way that cannot handle large m
+    # posterior_coo = sp.random(m, n, density=0.1, format='coo', dtype=float)
+    # posterior_coo2 = sp.random(m, n, density=0.08, format='coo', dtype=float)
+
+    m_array = np.random.randint(low=0, high=m, size=num_nonzeros - 1, dtype=np.uint64)
+    m_array = np.concatenate([m_array, np.array(m - 1, dtype=np.uint64)], axis=None)
+    n_array = np.random.randint(low=0, high=n, size=num_nonzeros)
+    val_array = np.random.rand(num_nonzeros) * -10
+    val_array2 = np.random.rand(num_nonzeros) * -5
+
+    posterior_coo = sp.coo_matrix((val_array, (m_array, n_array)), shape=(m, n))
+    posterior_coo2 = sp.coo_matrix((val_array2, (m_array, n_array)), shape=(m, n))
 
     if blank_noise_offsets:
         noise_offsets = {}
     else:
-        noise_offsets = dict(zip(np.random.randint(low=0, high=(m - 1), size=10),
+        noise_offsets = dict(zip(np.random.randint(low=0, high=(m - 1), size=10, dtype=np.uint64),
                                  np.random.randint(low=1, high=5, size=10)))
     kwargs = {'a': 'b', 'c': 1}
     kwargs2 = {'a': 'method', 'c': 1}
@@ -396,7 +408,7 @@ def test_save_and_load(tmpdir_factory, blank_noise_offsets):
     posterior._noise_count_regularized_posterior_coo = posterior_coo2
     posterior._noise_count_regularized_posterior_kwargs = kwargs2
     posterior._latents = {'p': np.random.randn(100), 'd': np.random.randn(100)}
-    posterior.index_converter = IndexConverter(total_n_cells=1000, total_n_genes=1000)
+    posterior.index_converter = IndexConverter(total_n_cells=max(1000, m // 1000 + 1), total_n_genes=1000)
 
     # save
     posterior.save(file=str(filename))
@@ -406,9 +418,14 @@
     posterior2.load(file=str(filename))
 
     # check
-    for attr in ['_noise_count_posterior_coo', '_noise_count_posterior_coo_offsets',
-                 '_noise_count_posterior_kwargs', '_noise_count_regularized_posterior_coo',
-                 '_noise_count_regularized_posterior_kwargs', '_latents']:
+    for attr in [
+        '_noise_count_posterior_coo',
+        '_noise_count_posterior_coo_offsets',
+        '_noise_count_posterior_kwargs',
+        '_noise_count_regularized_posterior_coo',
+        '_noise_count_regularized_posterior_kwargs',
+        '_latents'
+    ]:
         val1 = getattr(posterior, attr)
         val2 = getattr(posterior2, attr)
         print(f'{attr} ===================')
