Fix posterior and estimator integer overflow bugs on Windows #259

Merged: 10 commits, Oct 31, 2023
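For context on the root cause: NumPy's default integer type follows the platform's C long, which is 32-bit on Windows, so flattened 'm' indices (cell * n_genes + gene) can silently wrap. A minimal sketch of the failure mode, assuming NumPy < 2.0 on Windows (illustrative values, not from the PR):

```python
import numpy as np

# With a 32-bit default integer (NumPy < 2.0 on Windows), this product
# exceeds 2**31 - 1 and wraps to a negative number instead of raising.
cell_inds = np.array([100_000], dtype=np.int32)
n_genes = 30_000

m_bad = cell_inds * n_genes                                # wraps on int32
m_good = cell_inds.astype(np.uint64) * np.uint64(n_genes)  # the PR's approach

print(m_bad)   # [-1294967296]
print(m_good)  # [3000000000]
```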
6 changes: 4 additions & 2 deletions .github/workflows/run_pytest.yml
@@ -1,4 +1,4 @@
-# Run cellbender's tests
+# Run cellbender test suite

name: 'pytest'

@@ -7,11 +7,13 @@ on: pull_request
jobs:
  build:

-    runs-on: 'ubuntu-latest'
    strategy:
      matrix:
+        os: ['ubuntu-latest', 'windows-latest']
        python-version: ['3.7']

+    runs-on: ${{ matrix.os }}

    steps:
      - name: 'Checkout repo'
        uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion cellbender/remove_background/checkpoint.py
@@ -297,7 +297,7 @@ def make_tarball(files: List[str], tarball_name: str) -> bool:
        for file in files:
            # without arcname, unpacking results in unpredictable file locations!
            tar.add(file, arcname=os.path.basename(file))
-    os.rename(tarball_name + '.tmp', tarball_name)
+    os.replace(tarball_name + '.tmp', tarball_name)
    return True


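The checkpoint fix swaps os.rename for os.replace: on Windows, os.rename raises FileExistsError when the destination already exists, while os.replace overwrites it on every platform. A small sketch with illustrative file names (not the actual checkpoint paths):

```python
import os

# Illustrative names, not the codebase's real paths.
tmp, final = 'ckpt.tar.gz.tmp', 'ckpt.tar.gz'

with open(tmp, 'w') as f:
    f.write('new checkpoint')
with open(final, 'w') as f:
    f.write('old checkpoint')  # destination already exists

# os.rename(tmp, final) would raise FileExistsError on Windows here;
# os.replace overwrites the destination on POSIX and Windows alike.
os.replace(tmp, final)
```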
10 changes: 5 additions & 5 deletions cellbender/remove_background/estimation.py
@@ -218,7 +218,7 @@ def _estimation_array_to_csr(index_converter,
                             data: np.ndarray,
                             m: np.ndarray,
                             noise_offsets: Optional[Dict[int, int]],
-                             dtype=np.int) -> sp.csr_matrix:
+                             dtype=np.int64) -> sp.csr_matrix:
"""Say you have point estimates for each count matrix element (data) and
you have the 'm'-indices for each value (m). This returns a CSR matrix
that has the shape of the count matrix, where duplicate entries have
@@ -229,7 +229,7 @@ def _estimation_array_to_csr(index_converter,
            a flat format, indexed by 'm'.
        m: Array of the same length as data, where each entry is an m-index.
        noise_offsets: Noise count offset values keyed by 'm'.
-        dtype: Data type for values of sparse matrix
+        dtype: Data type for sparse matrix. Int32 is too small for 'm' indices.

Results:
noise_csr: Noise point estimate, as a CSR sparse matrix.
@@ -238,7 +238,7 @@ def _estimation_array_to_csr(index_converter,
    row, col = index_converter.get_ng_indices(m_inds=m)
    if noise_offsets is not None:
        data = data + np.array([noise_offsets.get(i, 0) for i in m])
-    coo = sp.coo_matrix((data.astype(dtype), (row.astype(dtype), col.astype(dtype))),
+    coo = sp.coo_matrix((data.astype(dtype), (row.astype(np.uint64), col.astype(np.uint8))),
                        shape=index_converter.matrix_shape, dtype=dtype)
    coo.sum_duplicates()
    return coo.tocsr()
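As the docstring above notes, duplicate (row, col) entries are summed; that COO-to-CSR behavior is what `_estimation_array_to_csr` relies on. A minimal sketch of the behavior with explicit 64-bit indices:

```python
import numpy as np
import scipy.sparse as sp

# Two values land on the same (row, col); sum_duplicates() merges them.
data = np.array([1, 2, 3], dtype=np.int64)
row = np.array([0, 0, 1], dtype=np.int64)  # (0, 5) appears twice
col = np.array([5, 5, 2], dtype=np.int64)

coo = sp.coo_matrix((data, (row, col)), shape=(2, 10), dtype=np.int64)
coo.sum_duplicates()
print(coo.tocsr()[0, 5])  # 3: the duplicates were summed
```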
@@ -810,7 +810,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
    """
    array_length = len(np.unique(noise_log_prob_coo.row))

-    m = np.zeros(array_length)
+    m = np.zeros(array_length, dtype=np.uint64)
    out = np.zeros(array_length)
    a = 0
@@ -829,7 +829,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
        out[a:(a + len_s)] = s.detach().cpu().numpy()
        a = a + len_s

-    return {'m': m.astype(int), 'result': out}
+    return {'m': m, 'result': out}
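Two details matter in this hunk: np.zeros defaults to float64, and `astype(int)` maps to the 32-bit C long on Windows under NumPy < 2.0. A short sketch of why the explicit uint64 dtype is needed (assuming such a platform):

```python
import numpy as np

m_default = np.zeros(3)                 # dtype is float64, not integer
m_index = np.zeros(3, dtype=np.uint64)  # exact integer 'm' indices

print(m_default.dtype, m_index.dtype)   # float64 uint64

# On Windows with NumPy < 2.0, .astype(int) means int32, so large 'm'
# values would wrap; returning the uint64 array unchanged avoids that.
```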


def pandas_grouped_apply(coo: sp.coo_matrix,
4 changes: 2 additions & 2 deletions cellbender/remove_background/posterior.py
@@ -552,7 +552,7 @@ def _get_cell_noise_count_posterior_coo(
        # Put the counts into a sparse csr_matrix.
        self._noise_count_posterior_coo = sp.coo_matrix(
            (log_probs, (m, c)),
-            shape=[np.prod(self.count_matrix_shape), n_counts_max],
+            shape=[np.prod(self.count_matrix_shape, dtype=np.uint64), n_counts_max],
        )
        noise_offset_dict = dict(zip(m, noise_count_offsets))
        nonzero_noise_offset_dict = {k: v for k, v in noise_offset_dict.items() if (v > 0)}
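np.prod uses the platform default integer as its accumulator, so on a 32-bit-long platform the product of a large count-matrix shape can wrap; passing dtype=np.uint64 pins the accumulator width. A sketch with illustrative numbers:

```python
import numpy as np

shape = (2_200_000, 30_000)  # illustrative cells x genes

# 66_000_000_000 > 2**31 - 1: wraps if the accumulator is int32
flat_size = np.prod(shape, dtype=np.uint64)
print(flat_size)  # 66000000000
```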
@@ -1527,7 +1527,7 @@ def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndarray) -> np.ndarray:
        if not ((gene_inds >= 0) & (gene_inds < self.total_n_genes)).all():
            raise ValueError(f'Requested gene_inds out of range: '
                             f'{gene_inds[(gene_inds < 0) | (gene_inds >= self.total_n_genes)]}')
-        return cell_inds * self.total_n_genes + gene_inds
+        return cell_inds.astype(np.uint64) * self.total_n_genes + gene_inds.astype(np.uint64)

    def get_ng_indices(self, m_inds: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Given a list of 'm' index values, return two arrays: cell index values
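For reference, the 'm' index scheme that get_m_indices and get_ng_indices implement: m = cell * total_n_genes + gene, inverted with divmod. A sketch of the round trip in uint64:

```python
import numpy as np

total_n_genes = np.uint64(30_000)
cell_inds = np.array([100_000], dtype=np.uint64)
gene_inds = np.array([123], dtype=np.uint64)

# Flatten (cell, gene) -> m, then invert; uint64 keeps the product exact.
m = cell_inds * total_n_genes + gene_inds
cells, genes = np.divmod(m, total_n_genes)

assert cells[0] == 100_000 and genes[0] == 123
```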
24 changes: 22 additions & 2 deletions cellbender/remove_background/tests/conftest.py
@@ -15,12 +15,32 @@
USE_CUDA = torch.cuda.is_available()


-def sparse_matrix_equal(mat1: sp.csc_matrix,
-                        mat2: sp.csc_matrix):
+def sparse_matrix_equal(mat1, mat2):
    """Fast assertion that sparse matrices are equal"""
+    if type(mat1) == sp.coo_matrix:
+        return coo_equal(mat1, mat2)
+    elif (type(mat1) == sp.csc_matrix) or (type(mat1) == sp.csr_matrix):
+        return csc_or_csr_equal(mat1, mat2)
+    else:
+        raise ValueError(f"Error with test functions: sparse_matrix_equal() was called with a {type(mat1)}")


+def csc_or_csr_equal(mat1: sp.csc_matrix,
+                     mat2: sp.csc_matrix):
+    """Fast assertion that CSC sparse matrices are equal"""
+    return (mat1 != mat2).sum() == 0


+def coo_equal(mat1: sp.csc_matrix,
+              mat2: sp.csc_matrix):
+    """Fast assertion that COO sparse matrices are equal"""
+    return (
+        ((mat1.data != mat2.data).sum() == 0)
+        and ((mat1.row != mat2.row).sum() == 0)
+        and ((mat1.col != mat2.col).sum() == 0)
+    )
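A quick sketch of how the new dispatch is meant to be used; it assumes both matrices store their entries in the same order with no duplicates (e.g., after sum_duplicates()), since coo_equal compares the backing arrays directly:

```python
import numpy as np
import scipy.sparse as sp

a = sp.coo_matrix((np.array([1., 2.]), (np.array([0, 1]), np.array([1, 0]))),
                  shape=(2, 2))
b = a.copy()

assert sparse_matrix_equal(a, b)                  # dispatches to coo_equal
assert sparse_matrix_equal(a.tocsr(), b.tocsr())  # dispatches to csc_or_csr_equal
```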


def tensors_equal(a: torch.Tensor,
                  b: torch.Tensor):
    """Assertion that torch tensors are equal for each element"""
23 changes: 23 additions & 0 deletions cellbender/remove_background/tests/test_estimation.py
@@ -81,6 +81,29 @@ def log_prob_coo(request, log_prob_coo_base) \
        raise ValueError(f'Test writing error: requested "{request.param}" log_prob_coo')


+def test_mean_massive_m(log_prob_coo):
+    """Sets up a posterior COO with massive m values that are > max(int32).
+    Will trigger github issue #252 if a bug exists, no assertion necessary.
+    """
+
+    coo = log_prob_coo['coo']
+    greater_than_max_int32 = 2200000000
+    new_row = coo.row.astype(np.int64) + greater_than_max_int32
+    new_shape = (coo.shape[0] + greater_than_max_int32, coo.shape[1])
+    new_coo = sp.coo_matrix((coo.data, (new_row, coo.col)),
+                            shape=new_shape)
+    offset_dict = {k + greater_than_max_int32: v for k, v in log_prob_coo['offsets'].items()}
+
+    # this is just a shim
+    converter = IndexConverter(total_n_cells=2,
+                               total_n_genes=new_coo.shape[0])

[Inline review comment from a Collaborator on the IndexConverter lines above]
This is a very cool library for "mock"ing classes and methods for unit tests:
https://docs.python.org/3/library/unittest.mock.html
(just for future reference)

+    # set up and estimate
+    estimator = Mean(index_converter=converter)
+    noise_csr = estimator.estimate_noise(noise_log_prob_coo=new_coo,
+                                         noise_offsets=offset_dict)
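Picking up the reviewer's pointer above: a hedged sketch of how unittest.mock could stand in for the IndexConverter shim (hypothetical attribute names based on the calls visible in this diff, not part of the PR):

```python
from unittest import mock

# Hypothetical stand-in: stub only what the estimator touches.
n_genes = 2_200_000_002
converter = mock.Mock()
converter.matrix_shape = (2, n_genes)
converter.get_ng_indices = mock.Mock(
    side_effect=lambda m_inds: (m_inds // n_genes, m_inds % n_genes))
```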


@pytest.fixture(scope='module', params=['exact', 'filtered', 'unsorted'])
def mckp_log_prob_coo(request, log_prob_coo_base) \
-> Dict[str, Union[sp.coo_matrix, np.ndarray, Dict[int, int]]]:
35 changes: 26 additions & 9 deletions cellbender/remove_background/tests/test_posterior.py
@@ -368,23 +368,35 @@ def test_compute_mean_target_removal_as_function(log_prob_coo, fpr, per_gene, cuda):


@pytest.mark.parametrize('blank_noise_offsets', [False, True], ids=['', 'no_noise_offsets'])
-def test_save_and_load(tmpdir_factory, blank_noise_offsets):
+@pytest.mark.parametrize('m', [1000, 2200000000], ids=['small', 'big'])
+def test_save_and_load(tmpdir_factory, blank_noise_offsets, m):
    """Test that a round trip through save and load gives the same thing"""

    tmp_dir = tmpdir_factory.mktemp('posterior')
    filename = tmp_dir.join('posterior.h5')

-    m = 1000
    n = 20
+    num_nonzeros = 1000

    posterior = Posterior(dataset_obj=None, vi_model=None)  # blank

-    posterior_coo = sp.random(m, n, density=0.1, format='coo', dtype=float)
-    posterior_coo2 = sp.random(m, n, density=0.08, format='coo', dtype=float)
+    # old way that cannot handle large m
+    # posterior_coo = sp.random(m, n, density=0.1, format='coo', dtype=float)
+    # posterior_coo2 = sp.random(m, n, density=0.08, format='coo', dtype=float)
+
+    m_array = np.random.randint(low=0, high=m, size=num_nonzeros - 1, dtype=np.uint64)
+    m_array = np.concatenate([m_array, np.array(m - 1, dtype=np.uint64)], axis=None)
+    n_array = np.random.randint(low=0, high=n, size=num_nonzeros)
+    val_array = np.random.rand(num_nonzeros) * -10
+    val_array2 = np.random.rand(num_nonzeros) * -5
+
+    posterior_coo = sp.coo_matrix((val_array, (m_array, n_array)), shape=(m, n))
+    posterior_coo2 = sp.coo_matrix((val_array2, (m_array, n_array)), shape=(m, n))

    if blank_noise_offsets:
        noise_offsets = {}
    else:
-        noise_offsets = dict(zip(np.random.randint(low=0, high=(m - 1), size=10),
+        noise_offsets = dict(zip(np.random.randint(low=0, high=(m - 1), size=10, dtype=np.uint64),
                                 np.random.randint(low=1, high=5, size=10)))
    kwargs = {'a': 'b', 'c': 1}
    kwargs2 = {'a': 'method', 'c': 1}

@@ -396,7 +408,7 @@ def test_save_and_load(tmpdir_factory, blank_noise_offsets):
    posterior._noise_count_regularized_posterior_coo = posterior_coo2
    posterior._noise_count_regularized_posterior_kwargs = kwargs2
    posterior._latents = {'p': np.random.randn(100), 'd': np.random.randn(100)}
-    posterior.index_converter = IndexConverter(total_n_cells=1000, total_n_genes=1000)
+    posterior.index_converter = IndexConverter(total_n_cells=max(1000, m // 1000 + 1), total_n_genes=1000)

    # save
    posterior.save(file=str(filename))
    posterior2.load(file=str(filename))

    # check
-    for attr in ['_noise_count_posterior_coo', '_noise_count_posterior_coo_offsets',
-                 '_noise_count_posterior_kwargs', '_noise_count_regularized_posterior_coo',
-                 '_noise_count_regularized_posterior_kwargs', '_latents']:
+    for attr in [
+        '_noise_count_posterior_coo',
+        '_noise_count_posterior_coo_offsets',
+        '_noise_count_posterior_kwargs',
+        '_noise_count_regularized_posterior_coo',
+        '_noise_count_regularized_posterior_kwargs',
+        '_latents'
+    ]:
        val1 = getattr(posterior, attr)
        val2 = getattr(posterior2, attr)
        print(f'{attr} ===================')