Commit
Fix posterior and estimator integer overflow bugs on Windows (#259)
sjfleming authored Oct 31, 2023
1 parent 322971d commit 12b4758
Showing 7 changed files with 83 additions and 21 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/run_pytest.yml
@@ -1,4 +1,4 @@
-# Run cellbender's tests
+# Run cellbender test suite
 
 name: 'pytest'
 
@@ -7,11 +7,13 @@ on: pull_request
 jobs:
   build:
 
-    runs-on: 'ubuntu-latest'
     strategy:
       matrix:
+        os: ['ubuntu-latest', 'windows-latest']
        python-version: ['3.7']
 
+    runs-on: ${{ matrix.os }}
+
     steps:
     - name: 'Checkout repo'
       uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion cellbender/remove_background/checkpoint.py
@@ -297,7 +297,7 @@ def make_tarball(files: List[str], tarball_name: str) -> bool:
         for file in files:
             # without arcname, unpacking results in unpredictable file locations!
             tar.add(file, arcname=os.path.basename(file))
-    os.rename(tarball_name + '.tmp', tarball_name)
+    os.replace(tarball_name + '.tmp', tarball_name)
     return True
 
 
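The one-line change above is the whole checkpoint fix: on Windows, os.rename() raises FileExistsError when the destination already exists, while os.replace() overwrites it on every platform. A minimal sketch (not repo code) of the difference:

import os
import tempfile

# Minimal sketch, not repo code: both source and destination exist, as they
# do when a checkpoint tarball is rewritten in place.
d = tempfile.mkdtemp()
src = os.path.join(d, 'ckpt.tar.gz.tmp')
dst = os.path.join(d, 'ckpt.tar.gz')
for path, text in [(src, 'fresh'), (dst, 'stale')]:
    with open(path, 'w') as f:
        f.write(text)

# os.rename(src, dst)  # POSIX: replaces silently; Windows: FileExistsError
os.replace(src, dst)   # every platform: atomically replaces the destination
with open(dst) as f:
    assert f.read() == 'fresh'
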
10 changes: 5 additions & 5 deletions cellbender/remove_background/estimation.py
@@ -218,7 +218,7 @@ def _estimation_array_to_csr(index_converter,
                              data: np.ndarray,
                              m: np.ndarray,
                              noise_offsets: Optional[Dict[int, int]],
-                             dtype=np.int) -> sp.csr_matrix:
+                             dtype=np.int64) -> sp.csr_matrix:
     """Say you have point estimates for each count matrix element (data) and
     you have the 'm'-indices for each value (m). This returns a CSR matrix
     that has the shape of the count matrix, where duplicate entries have
@@ -229,7 +229,7 @@ def _estimation_array_to_csr(index_converter,
         a flat format, indexed by 'm'.
         m: Array of the same length as data, where each entry is an m-index.
         noise_offsets: Noise count offset values keyed by 'm'.
-        dtype: Data type for values of sparse matrix
+        dtype: Data type for sparse matrix. Int32 is too small for 'm' indices.
 
     Results:
         noise_csr: Noise point estimate, as a CSR sparse matrix.
@@ -238,7 +238,7 @@ def _estimation_array_to_csr(index_converter,
     row, col = index_converter.get_ng_indices(m_inds=m)
     if noise_offsets is not None:
         data = data + np.array([noise_offsets.get(i, 0) for i in m])
-    coo = sp.coo_matrix((data.astype(dtype), (row.astype(dtype), col.astype(dtype))),
+    coo = sp.coo_matrix((data.astype(dtype), (row.astype(np.uint64), col.astype(np.uint32))),
                         shape=index_converter.matrix_shape, dtype=dtype)
     coo.sum_duplicates()
     return coo.tocsr()
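
This hunk is the heart of the estimator fix: flat 'm' indices grow as cells times genes, so they exceed int32 (NumPy's default integer on Windows) long before memory is a problem. A sketch of the failure mode, with made-up sizes:

import numpy as np
import scipy.sparse as sp

# Sketch with made-up sizes, not repo code. A flat index past 2**31 - 1
# survives as uint64 but wraps negative if it ever passes through int32.
m_index = np.array([2_200_000_000], dtype=np.uint64)
print(m_index.astype(np.int32))  # [-2094967296], the Windows failure mode

# With unsigned index arrays, scipy stores the entry where it belongs.
coo = sp.coo_matrix((np.array([1.0]), (m_index, np.array([3], dtype=np.uint32))),
                    shape=(2_300_000_000, 10))
print(coo.row[0])  # 2200000000
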
@@ -785,7 +785,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
     """
     array_length = len(np.unique(noise_log_prob_coo.row))
 
-    m = np.zeros(array_length)
+    m = np.zeros(array_length, dtype=np.uint64)
     out = np.zeros(array_length)
     a = 0
 
@@ -804,7 +804,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
         out[a:(a + len_s)] = s.detach().cpu().numpy()
         a = a + len_s
 
-    return {'m': m.astype(int), 'result': out}
+    return {'m': m, 'result': out}
 
 
 def pandas_grouped_apply(coo: sp.coo_matrix,
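The apply_function_dense_chunks hunks remove two quiet defaults: np.zeros() allocates float64 unless given a dtype, and .astype(int) casts to the platform C long, which is only 32 bits on Windows under NumPy 1.x. A short sketch, not repo code:

import numpy as np

m = np.zeros(4)
print(m.dtype)        # float64: the 'm' indices were silently floats before
print(np.dtype(int))  # int64 on Linux/macOS; int32 on Windows (NumPy 1.x)

# Keeping the array uint64 end to end avoids both the float round trip and
# the platform-dependent truncation of large indices.
m = np.zeros(4, dtype=np.uint64)
m[0] = 2_200_000_000
print(m[0])  # 2200000000, exact
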
4 changes: 2 additions & 2 deletions cellbender/remove_background/posterior.py
@@ -555,7 +555,7 @@ def _get_cell_noise_count_posterior_coo(
         # Put the counts into a sparse csr_matrix.
         self._noise_count_posterior_coo = sp.coo_matrix(
             (log_probs, (m, c)),
-            shape=[np.prod(self.count_matrix_shape), n_counts_max],
+            shape=[np.prod(self.count_matrix_shape, dtype=np.uint64), n_counts_max],
         )
         self._noise_count_posterior_coo_offsets = nonzero_noise_offset_dict
         return self._noise_count_posterior_coo
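
np.prod() has the same trap: without an explicit dtype it accumulates in the default platform integer, so the product of a large count-matrix shape wraps on Windows. A sketch that forces int32 to reproduce the Windows behavior on any OS:

import numpy as np

# Sketch with made-up sizes: 60k cells x 40k genes overflows int32.
# dtype=np.int32 stands in for the Windows default accumulator here;
# the fixed code passes dtype=np.uint64 instead.
shape = (60_000, 40_000)
print(np.prod(shape, dtype=np.int32))   # -1894967296, wrapped
print(np.prod(shape, dtype=np.uint64))  # 2400000000
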
@@ -1528,7 +1528,7 @@ def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndarray) -> np.ndar
         if not ((gene_inds >= 0) & (gene_inds < self.total_n_genes)).all():
             raise ValueError(f'Requested gene_inds out of range: '
                              f'{gene_inds[(gene_inds < 0) | (gene_inds >= self.total_n_genes)]}')
-        return cell_inds * self.total_n_genes + gene_inds
+        return cell_inds.astype(np.uint64) * self.total_n_genes + gene_inds.astype(np.uint64)
 
     def get_ng_indices(self, m_inds: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         """Given a list of 'm' index values, return two arrays: cell index values
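get_m_indices() is where the large flat indices are born, which is why both operands are promoted to uint64 before the multiply; casting the result afterwards would be too late. A sketch with made-up sizes:

import numpy as np

# Sketch, not repo code: the wrap happens inside the int32 multiply, so
# promoting the *result* cannot recover the true index.
total_n_genes = np.int32(40_000)
cell_inds = np.array([60_000], dtype=np.int32)
gene_inds = np.array([123], dtype=np.int32)

late = (cell_inds * total_n_genes + gene_inds).astype(np.uint64)  # garbage
early = cell_inds.astype(np.uint64) * 40_000 + gene_inds.astype(np.uint64)
print(late)   # e.g. [18446744071814584443]
print(early)  # [2400000123]
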
24 changes: 22 additions & 2 deletions cellbender/remove_background/tests/conftest.py
@@ -15,12 +15,32 @@
 USE_CUDA = torch.cuda.is_available()
 
 
-def sparse_matrix_equal(mat1: sp.csc_matrix,
-                        mat2: sp.csc_matrix):
+def sparse_matrix_equal(mat1, mat2):
     """Fast assertion that sparse matrices are equal"""
+    if type(mat1) == sp.coo_matrix:
+        return coo_equal(mat1, mat2)
+    elif (type(mat1) == sp.csc_matrix) or (type(mat1) == sp.csr_matrix):
+        return csc_or_csr_equal(mat1, mat2)
+    else:
+        raise ValueError(f"Error with test functions: sparse_matrix_equal() was called with a {type(mat1)}")
+
+
+def csc_or_csr_equal(mat1: sp.csc_matrix,
+                     mat2: sp.csc_matrix):
+    """Fast assertion that CSC sparse matrices are equal"""
     return (mat1 != mat2).sum() == 0
+
+
+def coo_equal(mat1: sp.csc_matrix,
+              mat2: sp.csc_matrix):
+    """Fast assertion that COO sparse matrices are equal"""
+    return (
+        ((mat1.data != mat2.data).sum() == 0)
+        and ((mat1.row != mat2.row).sum() == 0)
+        and ((mat1.col != mat2.col).sum() == 0)
+    )
 
 
 def tensors_equal(a: torch.Tensor,
                   b: torch.Tensor):
     """Assertion that torch tensors are equal for each element"""
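A hypothetical usage of the helpers above (the conftest import path is assumed): COO matrices are compared field by field on data, row, and col, so the check is sensitive to the exact stored representation, which is what the save/load round-trip tests need to preserve.

import numpy as np
import scipy.sparse as sp
# Assumed import path; in practice these helpers live in the test conftest.
from cellbender.remove_background.tests.conftest import sparse_matrix_equal

row = np.array([0, 5], dtype=np.uint64)
col = np.array([1, 2])
mat = sp.coo_matrix((np.array([-1.5, -0.5]), (row, col)), shape=(10, 4))

assert sparse_matrix_equal(mat, mat.copy())           # dispatches to coo_equal
assert sparse_matrix_equal(mat.tocsr(), mat.tocsr())  # dispatches to csc_or_csr_equal
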
23 changes: 23 additions & 0 deletions cellbender/remove_background/tests/test_estimation.py
@@ -81,6 +81,29 @@ def log_prob_coo(request, log_prob_coo_base) \
     raise ValueError(f'Test writing error: requested "{request.param}" log_prob_coo')
 
 
+def test_mean_massive_m(log_prob_coo):
+    """Sets up a posterior COO with massive m values that are > max(int32).
+    Will trigger github issue #252 if a bug exists; no assertion necessary.
+    """
+
+    coo = log_prob_coo['coo']
+    greater_than_max_int32 = 2200000000
+    new_row = coo.row.astype(np.int64) + greater_than_max_int32
+    new_shape = (coo.shape[0] + greater_than_max_int32, coo.shape[1])
+    new_coo = sp.coo_matrix((coo.data, (new_row, coo.col)),
+                            shape=new_shape)
+    offset_dict = {k + greater_than_max_int32: v for k, v in log_prob_coo['offsets'].items()}
+
+    # this is just a shim
+    converter = IndexConverter(total_n_cells=2,
+                               total_n_genes=new_coo.shape[0])
+
+    # set up and estimate
+    estimator = Mean(index_converter=converter)
+    noise_csr = estimator.estimate_noise(noise_log_prob_coo=new_coo,
+                                         noise_offsets=offset_dict)
+
+
 @pytest.fixture(scope='module', params=['exact', 'filtered', 'unsorted'])
 def mckp_log_prob_coo(request, log_prob_coo_base) \
         -> Dict[str, Union[sp.coo_matrix, np.ndarray, Dict[int, int]]]:
35 changes: 26 additions & 9 deletions cellbender/remove_background/tests/test_posterior.py
@@ -368,23 +368,35 @@ def test_compute_mean_target_removal_as_function(log_prob_coo, fpr, per_gene, cu
 
 @pytest.mark.parametrize('blank_noise_offsets', [False, True], ids=['', 'no_noise_offsets'])
-def test_save_and_load(tmpdir_factory, blank_noise_offsets):
+@pytest.mark.parametrize('m', [1000, 2200000000], ids=['small', 'big'])
+def test_save_and_load(tmpdir_factory, blank_noise_offsets, m):
     """Test that a round trip through save and load gives the same thing"""
 
     tmp_dir = tmpdir_factory.mktemp('posterior')
     filename = tmp_dir.join('posterior.h5')
 
-    m = 1000
     n = 20
+    num_nonzeros = 1000
 
     posterior = Posterior(dataset_obj=None, vi_model=None)  # blank
 
-    posterior_coo = sp.random(m, n, density=0.1, format='coo', dtype=float)
-    posterior_coo2 = sp.random(m, n, density=0.08, format='coo', dtype=float)
+    # old way that cannot handle large m
+    # posterior_coo = sp.random(m, n, density=0.1, format='coo', dtype=float)
+    # posterior_coo2 = sp.random(m, n, density=0.08, format='coo', dtype=float)
+
+    m_array = np.random.randint(low=0, high=m, size=num_nonzeros - 1, dtype=np.uint64)
+    m_array = np.concatenate([m_array, np.array(m - 1, dtype=np.uint64)], axis=None)
+    n_array = np.random.randint(low=0, high=n, size=num_nonzeros)
+    val_array = np.random.rand(num_nonzeros) * -10
+    val_array2 = np.random.rand(num_nonzeros) * -5
+
+    posterior_coo = sp.coo_matrix((val_array, (m_array, n_array)), shape=(m, n))
+    posterior_coo2 = sp.coo_matrix((val_array2, (m_array, n_array)), shape=(m, n))
 
     if blank_noise_offsets:
         noise_offsets = {}
     else:
-        noise_offsets = dict(zip(np.random.randint(low=0, high=(m - 1), size=10),
+        noise_offsets = dict(zip(np.random.randint(low=0, high=(m - 1), size=10, dtype=np.uint64),
                                  np.random.randint(low=1, high=5, size=10)))
     kwargs = {'a': 'b', 'c': 1}
     kwargs2 = {'a': 'method', 'c': 1}
@@ -396,7 +408,7 @@ def test_save_and_load(tmpdir_factory, blank_noise_offsets):
     posterior._noise_count_regularized_posterior_coo = posterior_coo2
     posterior._noise_count_regularized_posterior_kwargs = kwargs2
     posterior._latents = {'p': np.random.randn(100), 'd': np.random.randn(100)}
-    posterior.index_converter = IndexConverter(total_n_cells=1000, total_n_genes=1000)
+    posterior.index_converter = IndexConverter(total_n_cells=max(1000, m // 1000 + 1), total_n_genes=1000)
 
     # save
     posterior.save(file=str(filename))
@@ -406,9 +418,14 @@
     posterior2.load(file=str(filename))
 
     # check
-    for attr in ['_noise_count_posterior_coo', '_noise_count_posterior_coo_offsets',
-                 '_noise_count_posterior_kwargs', '_noise_count_regularized_posterior_coo',
-                 '_noise_count_regularized_posterior_kwargs', '_latents']:
+    for attr in [
+        '_noise_count_posterior_coo',
+        '_noise_count_posterior_coo_offsets',
+        '_noise_count_posterior_kwargs',
+        '_noise_count_regularized_posterior_coo',
+        '_noise_count_regularized_posterior_kwargs',
+        '_latents'
+    ]:
         val1 = getattr(posterior, attr)
         val2 = getattr(posterior2, attr)
         print(f'{attr} ===================')
